From dde88e78a1e94f238aee052b61dece532af0c919 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Wed, 31 Jan 2024 13:41:30 +0100 Subject: [PATCH 01/48] Code changes related to unused PARKIND1 JPRB --- src/trans/algor/interpol_decomp_mod.F90 | 2 +- src/trans/algor/set99.F90 | 2 +- src/trans/algor/set99b.F90 | 2 +- src/trans/external/get_current.F90 | 2 +- src/trans/external/setup_trans.F90 | 2 +- src/trans/external/specnorm.F90 | 9 +++------ src/trans/external/sugawc.F90 | 2 +- src/trans/include/ectrans/specnorm.h | 4 ++-- src/trans/include/ectrans/sugawc.h | 2 +- src/trans/internal/abort_trans_mod.F90 | 7 +++---- src/trans/internal/cpledn_mod.F90 | 2 +- src/trans/internal/dealloc_resol_mod.F90 | 2 +- src/trans/internal/dist_grid_32_ctl_mod.F90 | 2 +- src/trans/internal/ectrans_version_mod.F90.in | 2 +- src/trans/internal/field_split_mod.F90 | 2 +- src/trans/internal/gawl_mod.F90 | 2 +- src/trans/internal/inigptr_mod.F90 | 2 +- src/trans/internal/myrecvset_mod.F90 | 2 +- src/trans/internal/mysendset_mod.F90 | 2 +- src/trans/internal/pe2set_mod.F90 | 2 +- src/trans/internal/set2pe_mod.F90 | 2 +- src/trans/internal/set_resol_mod.F90 | 2 +- src/trans/internal/setup_dims_mod.F90 | 2 +- src/trans/internal/shuffle_mod.F90 | 2 +- src/trans/internal/spnorm_ctl_mod.F90 | 4 ++-- src/trans/internal/sugaw_mod.F90 | 4 ++-- src/trans/internal/sump_trans0_mod.F90 | 2 +- src/trans/internal/sump_trans_preleg_mod.F90 | 2 +- src/trans/internal/sumplatf_mod.F90 | 2 +- src/trans/internal/supol_mod.F90 | 4 ++-- src/trans/internal/supolf_mod.F90 | 2 +- src/trans/internal/tpm_constants.F90 | 4 ++-- src/trans/internal/tpm_ctl.F90 | 1 - src/trans/internal/tpm_dim.F90 | 2 +- src/trans/internal/tpm_gen.F90 | 2 +- src/trans/internal/tpm_geometry.F90 | 4 ++-- src/trans/internal/tpm_pol.F90 | 2 +- src/trans/internal/trgtol_mod.F90 | 4 ++-- src/trans/internal/trltog_mod.F90 | 4 ++-- src/trans/sharedmem/sharedmem_mod.F90 | 3 +-- 40 files changed, 51 insertions(+), 57 deletions(-) diff --git a/src/trans/algor/interpol_decomp_mod.F90 b/src/trans/algor/interpol_decomp_mod.F90 index e805275b..338e47af 100644 --- a/src/trans/algor/interpol_decomp_mod.F90 +++ b/src/trans/algor/interpol_decomp_mod.F90 @@ -23,7 +23,7 @@ MODULE INTERPOL_DECOMP_MOD ! Author: Mats Hamrud -USE PARKIND1, ONLY : JPRB, JPIM, JPRD, JPIB +USE EC_PARKIND, ONLY : JPIM, JPRD, JPIB IMPLICIT NONE CONTAINS !=========================================================================== diff --git a/src/trans/algor/set99.F90 b/src/trans/algor/set99.F90 index 3df7f64c..8065a674 100644 --- a/src/trans/algor/set99.F90 +++ b/src/trans/algor/set99.F90 @@ -33,7 +33,7 @@ SUBROUTINE SET99(TRIGS,IFAX,N) IF (LHOOK) CALL DR_HOOK('SET99',0,ZHOOK_HANDLE) IXXX=1 ! - DEL=4.0E0_JPRB*ASIN(1.0E0_JPRB)/REAL(N,KIND=JPRB) + DEL=4.0_JPRB * ASIN(1.0_JPRB)/REAL(N,KIND=JPRB) NIL=0 NHL=(N/2)-1 DO 10 K=NIL,NHL diff --git a/src/trans/algor/set99b.F90 b/src/trans/algor/set99b.F90 index 27eb6230..6aafca71 100644 --- a/src/trans/algor/set99b.F90 +++ b/src/trans/algor/set99b.F90 @@ -34,7 +34,7 @@ SUBROUTINE SET99B(TRIGS,IFAX,N,LDUSEFFT992) ! IXXX=1 ! - DEL=4.0E0_JPRB*ASIN(1.0E0_JPRB)/REAL(N,KIND=JPRB) + DEL=4.0_JPRB * ASIN(1.0_JPRB)/REAL(N,KIND=JPRB) NIL=0 NHL=(N/2)-1 DO 10 K=NIL,NHL diff --git a/src/trans/external/get_current.F90 b/src/trans/external/get_current.F90 index 85d94b23..8b751b4e 100644 --- a/src/trans/external/get_current.F90 +++ b/src/trans/external/get_current.F90 @@ -41,7 +41,7 @@ SUBROUTINE GET_CURRENT(KRESOL,LDLAM) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM !ifndef INTERFACE diff --git a/src/trans/external/setup_trans.F90 b/src/trans/external/setup_trans.F90 index 7bd34edd..13132e6f 100644 --- a/src/trans/external/setup_trans.F90 +++ b/src/trans/external/setup_trans.F90 @@ -218,7 +218,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& G%LREDUCED_GRID = .FALSE. -G%RSTRET=1.0_JPRB +G%RSTRET=1.0_JPRD D%LGRIDONLY = .FALSE. D%LSPLIT = .FALSE. D%LCPNMONLY=.FALSE. diff --git a/src/trans/external/specnorm.F90 b/src/trans/external/specnorm.F90 index 169a520d..1e032da7 100644 --- a/src/trans/external/specnorm.F90 +++ b/src/trans/external/specnorm.F90 @@ -8,8 +8,8 @@ ! nor does it submit to any jurisdiction. ! -SUBROUTINE SPECNORM(PSPEC,KVSET,KMASTER,KRESOL,PMET,PNORM) +SUBROUTINE SPECNORM(PNORM,PSPEC,KVSET,KMASTER,KRESOL,PMET) !**** *SPECNORM* - Compute global spectral norms ! Purpose. @@ -64,11 +64,11 @@ SUBROUTINE SPECNORM(PSPEC,KVSET,KMASTER,KRESOL,PMET,PNORM) ! Declaration of arguments +REAL(KIND=JPRB) , INTENT(OUT) :: PNORM(:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPEC(:,:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KMASTER REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PMET(:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PNORM(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL !ifndef INTERFACE @@ -114,9 +114,6 @@ SUBROUTINE SPECNORM(PSPEC,KVSET,KMASTER,KRESOL,PMET,PNORM) ENDIF ENDIF IF(MYPROC == IMASTER) THEN - IF(.NOT. PRESENT(PNORM)) THEN - CALL ABORT_TRANS('SPECNORM: PNORM NOT PRESENT') - ENDIF IF(UBOUND(PNORM,1) < IFLD_G) THEN CALL ABORT_TRANS('SPECNORM: PNORM TOO SMALL') ENDIF @@ -133,7 +130,7 @@ SUBROUTINE SPECNORM(PSPEC,KVSET,KMASTER,KRESOL,PMET,PNORM) ENDIF ENDIF -CALL SPNORM_CTL(PSPEC,IFLD,IFLD_G,KVSET,IMASTER,PMET,PNORM) +CALL SPNORM_CTL(PNORM,PSPEC,IFLD,IFLD_G,KVSET,IMASTER,PMET) !endif INTERFACE diff --git a/src/trans/external/sugawc.F90 b/src/trans/external/sugawc.F90 index b32a73f4..8f23f491 100644 --- a/src/trans/external/sugawc.F90 +++ b/src/trans/external/sugawc.F90 @@ -46,7 +46,7 @@ SUBROUTINE SUGAWC(KDGLG,PMU,PW) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPRD, JPIM +USE EC_PARKIND ,ONLY : JPRD, JPIM !ifndef INTERFACE diff --git a/src/trans/include/ectrans/specnorm.h b/src/trans/include/ectrans/specnorm.h index 159177c2..29fa8112 100644 --- a/src/trans/include/ectrans/specnorm.h +++ b/src/trans/include/ectrans/specnorm.h @@ -9,7 +9,7 @@ ! INTERFACE -SUBROUTINE SPECNORM(PSPEC,KVSET,KMASTER,KRESOL,PMET,PNORM) +SUBROUTINE SPECNORM(PNORM,PSPEC,KVSET,KMASTER,KRESOL,PMET) !**** *SPECNORM* - Compute global spectral norms @@ -55,11 +55,11 @@ IMPLICIT NONE ! Declaration of arguments +REAL(KIND=JPRB) , INTENT(OUT) :: PNORM(:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPEC(:,:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KMASTER REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PMET(:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PNORM(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL ! ------------------------------------------------------------------ diff --git a/src/trans/include/ectrans/sugawc.h b/src/trans/include/ectrans/sugawc.h index 383db3c9..13bc0a0e 100644 --- a/src/trans/include/ectrans/sugawc.h +++ b/src/trans/include/ectrans/sugawc.h @@ -45,7 +45,7 @@ SUBROUTINE SUGAWC(KDGLG,PMU,PW) ! -------------- ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRD +USE EC_PARKIND ,ONLY : JPIM ,JPRD ! ------------------------------------------------------------------ diff --git a/src/trans/internal/abort_trans_mod.F90 b/src/trans/internal/abort_trans_mod.F90 index aee35f53..aebc5b8a 100644 --- a/src/trans/internal/abort_trans_mod.F90 +++ b/src/trans/internal/abort_trans_mod.F90 @@ -13,8 +13,7 @@ MODULE ABORT_TRANS_MOD SUBROUTINE ABORT_TRANS(CDTEXT) USE TPM_GEN , ONLY : NOUT,NERR -USE TPM_DISTR, ONLY : NPROC,MYPROC -USE MPL_MODULE, ONLY : MPL_ABORT +USE MPL_MODULE, ONLY : MPL_ABORT, MPL_RANK, MPL_NUMPROC USE SDL_MOD, ONLY : SDL_TRACEBACK, SDL_SRLABORT IMPLICIT NONE @@ -25,9 +24,9 @@ SUBROUTINE ABORT_TRANS(CDTEXT) WRITE(NOUT,'(1X,A)') 'ABORT_TRANS CALLED' WRITE(NOUT,'(1X,A)') CDTEXT -WRITE(NERR,'(1X,A,1X,I3,1X,A)') 'ABORT! ',MYPROC,CDTEXT +WRITE(NERR,'(1X,A,1X,I3,1X,A)') 'ABORT! ',MPL_RANK,CDTEXT CLOSE(NOUT) -IF (NPROC > 1) THEN +IF (MPL_NUMPROC > 1) THEN CALL MPL_ABORT(CDTEXT) ELSE CALL SDL_TRACEBACK diff --git a/src/trans/internal/cpledn_mod.F90 b/src/trans/internal/cpledn_mod.F90 index 9b60b18e..17f7504c 100644 --- a/src/trans/internal/cpledn_mod.F90 +++ b/src/trans/internal/cpledn_mod.F90 @@ -63,7 +63,7 @@ SUBROUTINE CPLEDN(KN,KODD,PFN,PX,KFLAG,PW,PXN,PXMOD) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPRD, JPIM +USE EC_PARKIND ,ONLY : JPRD, JPIM ! ------------------------------------------------------------------ diff --git a/src/trans/internal/dealloc_resol_mod.F90 b/src/trans/internal/dealloc_resol_mod.F90 index ff6d88c3..8f708f31 100644 --- a/src/trans/internal/dealloc_resol_mod.F90 +++ b/src/trans/internal/dealloc_resol_mod.F90 @@ -41,7 +41,7 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_DIM ,ONLY : R USE TPM_GEN ,ONLY : LENABLED, NOUT,NDEF_RESOL diff --git a/src/trans/internal/dist_grid_32_ctl_mod.F90 b/src/trans/internal/dist_grid_32_ctl_mod.F90 index add86893..61465eb1 100644 --- a/src/trans/internal/dist_grid_32_ctl_mod.F90 +++ b/src/trans/internal/dist_grid_32_ctl_mod.F90 @@ -43,7 +43,7 @@ SUBROUTINE DIST_GRID_32_CTL(PGPG,KFDISTG,KPROMA,KFROM,PGP) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB ,JPRM +USE PARKIND1 ,ONLY : JPIM ,JPRM USE MPL_MODULE USE TPM_DISTR ,ONLY : D, NPROC, MYPROC, NPRCIDS, MTAGDISTGP diff --git a/src/trans/internal/ectrans_version_mod.F90.in b/src/trans/internal/ectrans_version_mod.F90.in index 88cae2da..026c0675 100644 --- a/src/trans/internal/ectrans_version_mod.F90.in +++ b/src/trans/internal/ectrans_version_mod.F90.in @@ -26,7 +26,7 @@ CONTAINS !**** *ECTRANS_VERSION_INT* - Return ecTrans version as an integer - USE PARKIND1 ,ONLY : JPIM + USE EC_PARKIND ,ONLY : JPIM INTEGER(KIND=JPIM) :: ECTRANS_VERSION_INT diff --git a/src/trans/internal/field_split_mod.F90 b/src/trans/internal/field_split_mod.F90 index daa2282d..41773df1 100644 --- a/src/trans/internal/field_split_mod.F90 +++ b/src/trans/internal/field_split_mod.F90 @@ -58,7 +58,7 @@ SUBROUTINE FIELD_SPLIT(KBLK,KF_GP,KKF_UV_G,KVSETUV,KVSETSC,& ! Original : 01-01-03 ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_GEN ,ONLY : NPROMATR !USE TPM_TRANS diff --git a/src/trans/internal/gawl_mod.F90 b/src/trans/internal/gawl_mod.F90 index c188e59f..b42178f0 100644 --- a/src/trans/internal/gawl_mod.F90 +++ b/src/trans/internal/gawl_mod.F90 @@ -61,7 +61,7 @@ SUBROUTINE GAWL(PFN,PL,PW,PEPS,KN,KITER,PMOD) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPRD, JPIM +USE EC_PARKIND ,ONLY : JPRD, JPIM USE CPLEDN_MOD ,ONLY : CPLEDN diff --git a/src/trans/internal/inigptr_mod.F90 b/src/trans/internal/inigptr_mod.F90 index f30b44a1..1c3fe72d 100644 --- a/src/trans/internal/inigptr_mod.F90 +++ b/src/trans/internal/inigptr_mod.F90 @@ -14,7 +14,7 @@ SUBROUTINE INIGPTR(KGPTRSEND,KGPTRRECV) ! Compute tables to assist GP to/from Fourier space transpositions -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_GEN ,ONLY : NOUT USE TPM_DISTR ,ONLY : D, NPRTRNS diff --git a/src/trans/internal/myrecvset_mod.F90 b/src/trans/internal/myrecvset_mod.F90 index ba439c4e..948b6754 100644 --- a/src/trans/internal/myrecvset_mod.F90 +++ b/src/trans/internal/myrecvset_mod.F90 @@ -52,7 +52,7 @@ FUNCTION MYRECVSET(KSETS,KMYSET,KSET) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS ! diff --git a/src/trans/internal/mysendset_mod.F90 b/src/trans/internal/mysendset_mod.F90 index 7db5c526..86c02bd5 100644 --- a/src/trans/internal/mysendset_mod.F90 +++ b/src/trans/internal/mysendset_mod.F90 @@ -50,7 +50,7 @@ FUNCTION MYSENDSET(KSETS,KMYSET,KSET) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS IMPLICIT NONE diff --git a/src/trans/internal/pe2set_mod.F90 b/src/trans/internal/pe2set_mod.F90 index f1703a9b..2197ef06 100644 --- a/src/trans/internal/pe2set_mod.F90 +++ b/src/trans/internal/pe2set_mod.F90 @@ -70,7 +70,7 @@ SUBROUTINE PE2SET(KPE,KPRGPNS,KPRGPEW,KPRTRW,KPRTRV) ! Revision : 98-10-13 row ordering ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_DISTR ,ONLY : LEQ_REGIONS, NPRGPEW, NPROC, NPRTRV USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_NS diff --git a/src/trans/internal/set2pe_mod.F90 b/src/trans/internal/set2pe_mod.F90 index 1a403e31..595d5c62 100644 --- a/src/trans/internal/set2pe_mod.F90 +++ b/src/trans/internal/set2pe_mod.F90 @@ -67,7 +67,7 @@ SUBROUTINE SET2PE(KPE,KPRGPNS,KPRGPEW,KPRTRW,KPRTRV) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_DISTR ,ONLY : LEQ_REGIONS, NPRGPEW, NPRGPNS, NPRTRV, NPRTRW USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_NS diff --git a/src/trans/internal/set_resol_mod.F90 b/src/trans/internal/set_resol_mod.F90 index 0cf67081..2bf8f70d 100644 --- a/src/trans/internal/set_resol_mod.F90 +++ b/src/trans/internal/set_resol_mod.F90 @@ -11,7 +11,7 @@ MODULE SET_RESOL_MOD CONTAINS SUBROUTINE SET_RESOL(KRESOL,LDSETUP) -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_GEN ,ONLY : NOUT, MSETUP0, NCUR_RESOL, NMAX_RESOL,LENABLED USE TPM_DIM ,ONLY : R, DIM_RESOL diff --git a/src/trans/internal/setup_dims_mod.F90 b/src/trans/internal/setup_dims_mod.F90 index d8178c2d..97449f9b 100644 --- a/src/trans/internal/setup_dims_mod.F90 +++ b/src/trans/internal/setup_dims_mod.F90 @@ -12,7 +12,7 @@ MODULE SETUP_DIMS_MOD CONTAINS SUBROUTINE SETUP_DIMS -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_DIM ,ONLY : R USE TPM_FLT ,ONLY : S diff --git a/src/trans/internal/shuffle_mod.F90 b/src/trans/internal/shuffle_mod.F90 index 074b3510..7c1ec14c 100644 --- a/src/trans/internal/shuffle_mod.F90 +++ b/src/trans/internal/shuffle_mod.F90 @@ -55,7 +55,7 @@ SUBROUTINE SHUFFLE(KF_UV_G,KF_SCALARS_G,KSHFUV_G,KIVSETUV,KSHFSC_G,KIVSETSC,& ! Original : 01-01-03 ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_DISTR ,ONLY : NPRTRV ! diff --git a/src/trans/internal/spnorm_ctl_mod.F90 b/src/trans/internal/spnorm_ctl_mod.F90 index 154e5dc5..786bcec7 100644 --- a/src/trans/internal/spnorm_ctl_mod.F90 +++ b/src/trans/internal/spnorm_ctl_mod.F90 @@ -10,7 +10,7 @@ MODULE SPNORM_CTL_MOD CONTAINS -SUBROUTINE SPNORM_CTL(PSPEC,KFLD,KFLD_G,KVSET,KMASTER,PMET,PNORM) +SUBROUTINE SPNORM_CTL(PNORM,PSPEC,KFLD,KFLD_G,KVSET,KMASTER,PMET) USE PARKIND1 ,ONLY : JPIM ,JPRB @@ -23,11 +23,11 @@ SUBROUTINE SPNORM_CTL(PSPEC,KFLD,KFLD_G,KVSET,KMASTER,PMET,PNORM) IMPLICIT NONE +REAL(KIND=JPRB) , INTENT(OUT) :: PNORM(:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPEC(:,:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KMASTER REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PMET(:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PNORM(:) INTEGER(KIND=JPIM) , INTENT(IN) :: KFLD,KFLD_G INTEGER(KIND=JPIM) :: IVSET(KFLD_G) REAL(KIND=JPRB) :: ZMET(0:R%NSMAX) diff --git a/src/trans/internal/sugaw_mod.F90 b/src/trans/internal/sugaw_mod.F90 index c7d1bb6a..98b856cd 100644 --- a/src/trans/internal/sugaw_mod.F90 +++ b/src/trans/internal/sugaw_mod.F90 @@ -12,8 +12,7 @@ MODULE SUGAW_MOD CONTAINS SUBROUTINE SUGAW(KDGL,KM,KN,PL,PW,PANM,PFN) -USE PARKIND1 ,ONLY : JPRD, JPIM -USE PARKIND2 ,ONLY : JPRH +USE EC_PARKIND ,ONLY : JPRD, JPIM USE TPM_CONSTANTS ,ONLY : RA @@ -108,6 +107,7 @@ SUBROUTINE SUGAW(KDGL,KM,KN,PL,PW,PANM,PFN) ! computations in extended precision for alternative root finding ! which also works for associated polynomials (m>0) +INTEGER, PARAMETER :: JPRH = JPRD REAL(KIND=JPRH) :: ZLK, ZLK1, ZLLDN, ZANM REAL(KIND=JPRH) :: ZTHETA, ZTHETA0, ZX, ZX0, ZDX0, ZH, ZPIH, ZS0 REAL(KIND=JPRH) :: ZK1, ZK2, ZK3, ZK4 diff --git a/src/trans/internal/sump_trans0_mod.F90 b/src/trans/internal/sump_trans0_mod.F90 index d24d8c46..54ed0825 100644 --- a/src/trans/internal/sump_trans0_mod.F90 +++ b/src/trans/internal/sump_trans0_mod.F90 @@ -14,7 +14,7 @@ SUBROUTINE SUMP_TRANS0 ! Set up distributed environment for the transform package (part 0) -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE MPL_MODULE ,ONLY : MPL_GROUPS_CREATE, MPL_MYRANK, MPL_NPROC USE TPM_GEN ,ONLY : NOUT, LMPOFF, NPRINTLEV diff --git a/src/trans/internal/sump_trans_preleg_mod.F90 b/src/trans/internal/sump_trans_preleg_mod.F90 index 7bd9806a..c02a86be 100644 --- a/src/trans/internal/sump_trans_preleg_mod.F90 +++ b/src/trans/internal/sump_trans_preleg_mod.F90 @@ -14,7 +14,7 @@ SUBROUTINE SUMP_TRANS_PRELEG ! Set up distributed environment for the transform package (part 1) -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DIM ,ONLY : R diff --git a/src/trans/internal/sumplatf_mod.F90 b/src/trans/internal/sumplatf_mod.F90 index 1b4f1fd3..3de226e4 100644 --- a/src/trans/internal/sumplatf_mod.F90 +++ b/src/trans/internal/sumplatf_mod.F90 @@ -75,7 +75,7 @@ SUBROUTINE SUMPLATF(KDGL,KPROCA,KMYSETA,& ! not lelam features in new routine sumplatb.F ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_GEOMETRY ,ONLY : G diff --git a/src/trans/internal/supol_mod.F90 b/src/trans/internal/supol_mod.F90 index 1b5bf791..327ec60e 100644 --- a/src/trans/internal/supol_mod.F90 +++ b/src/trans/internal/supol_mod.F90 @@ -62,8 +62,8 @@ SUBROUTINE SUPOL(KNSMAX,PDDMU,PFN,PDDPOL) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPRD, JPIM -USE TPM_POL ,ONLY : DDI, DDA, DDH, DDE, DDC, DDD +USE EC_PARKIND ,ONLY : JPRD, JPIM +USE TPM_POL ,ONLY : DDI, DDA, DDH, DDE, DDC, DDD IMPLICIT NONE diff --git a/src/trans/internal/supolf_mod.F90 b/src/trans/internal/supolf_mod.F90 index 1a0ba787..eb3e25b4 100644 --- a/src/trans/internal/supolf_mod.F90 +++ b/src/trans/internal/supolf_mod.F90 @@ -59,7 +59,7 @@ SUBROUTINE SUPOLF(KM,KNSMAX,DDMU,DDPOL,KCHEAP) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPRD, JPIM +USE EC_PARKIND ,ONLY : JPRD, JPIM USE TPM_POL ,ONLY : DFI, DFB, DFG, DFA, DFF diff --git a/src/trans/internal/tpm_constants.F90 b/src/trans/internal/tpm_constants.F90 index 1f72a4b2..0502f77b 100644 --- a/src/trans/internal/tpm_constants.F90 +++ b/src/trans/internal/tpm_constants.F90 @@ -9,12 +9,12 @@ ! MODULE TPM_CONSTANTS -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE EC_PARKIND ,ONLY : JPRD IMPLICIT NONE SAVE -REAL(KIND=JPRB) :: RA ! Radius of Earth +REAL(KIND=JPRD) :: RA ! Radius of Earth END MODULE TPM_CONSTANTS diff --git a/src/trans/internal/tpm_ctl.F90 b/src/trans/internal/tpm_ctl.F90 index a5ee258f..b7be06f5 100644 --- a/src/trans/internal/tpm_ctl.F90 +++ b/src/trans/internal/tpm_ctl.F90 @@ -10,7 +10,6 @@ MODULE TPM_CTL -USE PARKIND1 ,ONLY : JPIM ,JPRB USE, INTRINSIC :: iso_c_binding, ONLY: C_PTR, C_NULL_PTR USE SHAREDMEM_MOD ,ONLY : SHAREDMEM IMPLICIT NONE diff --git a/src/trans/internal/tpm_dim.F90 b/src/trans/internal/tpm_dim.F90 index 4d56f922..3f3ca3c4 100644 --- a/src/trans/internal/tpm_dim.F90 +++ b/src/trans/internal/tpm_dim.F90 @@ -12,7 +12,7 @@ MODULE TPM_DIM ! Module for dimensions. -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE diff --git a/src/trans/internal/tpm_gen.F90 b/src/trans/internal/tpm_gen.F90 index 1f5e22df..3ea42afc 100644 --- a/src/trans/internal/tpm_gen.F90 +++ b/src/trans/internal/tpm_gen.F90 @@ -12,7 +12,7 @@ MODULE TPM_GEN ! Module for general control variables. -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE diff --git a/src/trans/internal/tpm_geometry.F90 b/src/trans/internal/tpm_geometry.F90 index 2c6e2ff0..48454a37 100644 --- a/src/trans/internal/tpm_geometry.F90 +++ b/src/trans/internal/tpm_geometry.F90 @@ -12,7 +12,7 @@ MODULE TPM_GEOMETRY ! Module containing data describing Gaussian grid. -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE EC_PARKIND ,ONLY : JPIM ,JPRD IMPLICIT NONE @@ -27,7 +27,7 @@ MODULE TPM_GEOMETRY LOGICAL :: LAM ! LAM geometry if T, Global geometry if F LOGICAL :: LREDUCED_GRID ! Reduced Gaussian grid if T ! quadratic Gaussian grid otherwise. -REAL(KIND=JPRB) :: RSTRET ! Stretching factor (for Legendre polynomials +REAL(KIND=JPRD) :: RSTRET ! Stretching factor (for Legendre polynomials ! computed on stretched latitudes only) END TYPE GEOM_TYPE diff --git a/src/trans/internal/tpm_pol.F90 b/src/trans/internal/tpm_pol.F90 index 448a0f9f..f563d960 100644 --- a/src/trans/internal/tpm_pol.F90 +++ b/src/trans/internal/tpm_pol.F90 @@ -15,7 +15,7 @@ MODULE TPM_POL ! R. El Khatib 17-Feb-2016 Optional allocation/computation of DDC/DDD/DDE ! since they are (big and) not used in supolf. -USE PARKIND1 ,ONLY : JPRD, JPIM +USE EC_PARKIND ,ONLY : JPRD, JPIM IMPLICIT NONE diff --git a/src/trans/internal/trgtol_mod.F90 b/src/trans/internal/trgtol_mod.F90 index 6a8060ff..064da76a 100644 --- a/src/trans/internal/trgtol_mod.F90 +++ b/src/trans/internal/trgtol_mod.F90 @@ -415,7 +415,7 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& -USE PARKIND1 ,ONLY : JPIM ,JPRB ,JPIA +USE PARKIND1 ,ONLY : JPIM ,JPRB ,JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_WAITANY, & @@ -472,7 +472,7 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& INTEGER(KIND=JPIM) :: ISEND, ITAG, JBLK, JFLD, JK, JL, IFLD, II, IFLDS, INS, INR INTEGER(KIND=JPIM) :: JJ,JI,IFLDT, J -INTEGER(KIND=JPIA) :: JFLD64 +INTEGER(KIND=JPIB) :: JFLD64 INTEGER(KIND=JPIM) :: IUVLEVS(KF_GP),IUVPARS(KF_GP),IGP2PARS(KF_GP) INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) diff --git a/src/trans/internal/trltog_mod.F90 b/src/trans/internal/trltog_mod.F90 index 4beee17b..b50f55ed 100644 --- a/src/trans/internal/trltog_mod.F90 +++ b/src/trans/internal/trltog_mod.F90 @@ -445,7 +445,7 @@ SUBROUTINE TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& ! R. El Khatib 09-Sep-2020 64 bits addressing for PGLAT ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB ,JPIA +USE PARKIND1 ,ONLY : JPIM ,JPRB ,JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_WAITANY, & @@ -503,7 +503,7 @@ SUBROUTINE TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& INTEGER(KIND=JPIM) :: ISEND, ITAG, JBLK, JFLD, JK, JL, IFLDS, INR, INS INTEGER(KIND=JPIM) :: II,ILEN, IFLDT, JI, JJ, J -INTEGER(KIND=JPIA) :: JFLD64 +INTEGER(KIND=JPIB) :: JFLD64 LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY LOGICAL :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP) diff --git a/src/trans/sharedmem/sharedmem_mod.F90 b/src/trans/sharedmem/sharedmem_mod.F90 index 16bb0fca..bb28a489 100644 --- a/src/trans/sharedmem/sharedmem_mod.F90 +++ b/src/trans/sharedmem/sharedmem_mod.F90 @@ -17,7 +17,6 @@ MODULE SHAREDMEM_MOD USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT, C_NULL_PTR,C_SIZE_T -USE PARKIND1 ,ONLY : JPIM, JPRB ,JPRD #ifdef __NEC__ #define C_SIZEOF(x) INT(KIND(x),C_SIZE_T) @@ -187,7 +186,7 @@ SUBROUTINE SHAREDMEM_ASSOCIATE1_INT32(HANDLE,SIZE,FPTR,ADVANCE) USE, INTRINSIC :: ISO_C_BINDING TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE INTEGER(C_INT), INTENT(IN) :: SIZE - INTEGER(KIND=JPIM), POINTER, INTENT(INOUT) :: FPTR(:) + INTEGER(KIND=C_INT), POINTER, INTENT(INOUT) :: FPTR(:) LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE INTEGER(C_INT) :: K From 41eeac60c63570ade604aa45c6495e1de89ef6df Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Wed, 31 Jan 2024 13:42:29 +0100 Subject: [PATCH 02/48] Generate libraries with unique symbols Co-authored-by: Zbigniew Piotrowski --- CMakeLists.txt | 5 + src/trans/CMakeLists.txt | 226 ++++++++++++++++++++++++++++++++------- src/trans/sedrenames.txt | 166 ++++++++++++++++++++++++++++ tests/CMakeLists.txt | 4 +- 4 files changed, 363 insertions(+), 38 deletions(-) create mode 100644 src/trans/sedrenames.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 89c513c3..15dabc14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,11 @@ # nor does it submit to any jurisdiction. cmake_minimum_required( VERSION 3.12 FATAL_ERROR ) + +# Once we update the minimum required version to 3.20, +# we can use the command `cmake_path` instead of `get_filename_component`. +# For discussion, see https://github.com/ecmwf-ifs/ectrans/pull/61#discussion_r1494362092 + find_package( ecbuild 3.4 REQUIRED HINTS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../ecbuild ) project( ectrans LANGUAGES C Fortran ) diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index 62c94714..c4157ea5 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -32,41 +32,193 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES "Intel") endif() endif() -## Assemble sources - -ecbuild_list_add_pattern( LIST trans_src - GLOB - sharedmem/* - algor/* - internal/* - external/* - ${CMAKE_CURRENT_BINARY_DIR}/internal/ectrans_version_mod.F90 - QUIET - ) - -if( NOT HAVE_FFTW ) - ecbuild_list_exclude_pattern( LIST trans_src REGEX tpm_fftw.F90 ) +## Sources which are precision independent can go into a common library +list( APPEND ectrans_common_src + sharedmem/sharedmem_mod.F90 + sharedmem/sharedmem.c + internal/abort_trans_mod.F90 + internal/cpledn_mod.F90 + internal/gawl_mod.F90 + internal/sugaw_mod.F90 + internal/supol_mod.F90 + internal/supolf_mod.F90 + internal/tpm_constants.F90 + internal/tpm_ctl.F90 + internal/tpm_dim.F90 + internal/tpm_gen.F90 + internal/tpm_geometry.F90 + internal/tpm_pol.F90 + external/get_current.F90 + external/sugawc.F90 + ${CMAKE_CURRENT_BINARY_DIR}/internal/ectrans_version_mod.F90 +) +list( APPEND ectrans_common_includes + include/ectrans/sugawc.h + include/ectrans/get_current.h +) + +ecbuild_add_library( + TARGET ectrans_common + LINKER_LANGUAGE Fortran + SOURCES ${ectrans_common_src} + PUBLIC_LIBS fiat +) +ectrans_target_fortran_module_directory( + TARGET ectrans_common + MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/module/ectrans + INSTALL_DIRECTORY module/ectrans +) + +if( HAVE_OMP ) + ecbuild_debug("target_link_libraries( trans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") + target_link_libraries( ectrans_common PRIVATE OpenMP::OpenMP_Fortran ) endif() + + +function(generate_file) + set (options) + set (oneValueArgs INPUT OUTPUT BACKEND) + set (multiValueArgs) + cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(output ${_PAR_OUTPUT}) + set(input ${_PAR_INPUT}) + set(backend ${_PAR_BACKEND}) + + set( JPRB_dp JPRD ) + set( jprb_dp jprd ) + set( JPRB_sp JPRM ) + set( jprb_sp jprm ) + + add_custom_command( + OUTPUT ${output} + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/sedrenames.txt | + sed -e "s/VARIANTDESIGNATOR/${backend}/g" | + sed -e "s/TYPEDESIGNATOR_UPPER/${JPRB_${backend}}/g" | + sed -e "s/TYPEDESIGNATOR_LOWER/${jprb_${backend}}/g" | + sed -rf - ${CMAKE_CURRENT_SOURCE_DIR}/${input} > ${output} + DEPENDS ${input} ${CMAKE_CURRENT_SOURCE_DIR}/sedrenames.txt + COMMENT "Generating ${output}" + VERBATIM + ) +endfunction(generate_file) + + +function(generate_backend_includes) + set (options) + set (oneValueArgs BACKEND TARGET DESTINATION) + set (multiValueArgs) + cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(destination ${_PAR_DESTINATION} ) + set(backend ${_PAR_BACKEND}) + + file(MAKE_DIRECTORY ${destination}) + file(MAKE_DIRECTORY ${destination}/${backend}) + + ecbuild_list_add_pattern( LIST files GLOB include/ectrans/*.h QUIET ) + + set( outfiles ) + foreach(file_i ${files}) + get_filename_component(outfile_name ${file_i} NAME) + get_filename_component(outfile_name_we ${file_i} NAME_WE) + get_filename_component(outfile_ext ${file_i} EXT) + get_filename_component(outfile_dir ${file_i} DIRECTORY) + if (${file_i} IN_LIST ectrans_common_includes) + configure_file(${file_i} ${destination}/${outfile_name}) + else() + set(outfile "${destination}/${outfile_name_we}_${backend}${outfile_ext}") + ecbuild_debug("Generate ${outfile}") + generate_file(BACKEND ${backend} INPUT ${file_i} OUTPUT ${outfile}) + list(APPEND outfiles ${outfile}) + string(TOUPPER ${outfile_name_we} OUTFILE_NAME_WE ) + ecbuild_debug("Generate ${destination}/trans_${backend}/${outfile_name}") + file(WRITE ${destination}/trans_${backend}/${outfile_name} "! Automatically generated interface header for backward compatibility of generic symbols !\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#if defined(${outfile_name_we})\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#undef ${outfile_name_we}\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#endif\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#if defined(${OUTFILE_NAME_WE})\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#undef ${OUTFILE_NAME_WE}\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#endif\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#include \"${outfile_name_we}_${backend}${outfile_ext}\"\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#define ${outfile_name_we} ${OUTFILE_NAME_WE}_${backend}\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#define ${OUTFILE_NAME_WE} ${OUTFILE_NAME_WE}_${backend}\n") + endif() + endforeach(file_i) + + add_custom_target(${_PAR_TARGET}_generate DEPENDS ${outfiles}) + ecbuild_add_library(TARGET ${_PAR_TARGET} TYPE INTERFACE) + add_dependencies(${_PAR_TARGET} ${_PAR_TARGET}_generate) + target_include_directories(${_PAR_TARGET} INTERFACE $) +endfunction(generate_backend_includes) + + + +function(generate_backend_sources) + set (options) + set (oneValueArgs BACKEND DESTINATION OUTPUT) + set (multiValueArgs) + + cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(backend ${_PAR_BACKEND}) + set(destination ${_PAR_DESTINATION}) + file(MAKE_DIRECTORY ${destination}/algor) + file(MAKE_DIRECTORY ${destination}/internal) + file(MAKE_DIRECTORY ${destination}/external) + + ecbuild_list_add_pattern( LIST files + GLOB + algor/* + internal/* + external/* + QUIET + ) + if( NOT HAVE_FFTW ) + ecbuild_list_exclude_pattern( LIST files REGEX tpm_fftw.F90 ) + endif() + + + set(outfiles) + foreach(file_i ${files}) + if(NOT (${file_i} IN_LIST ectrans_common_src)) + get_filename_component(outfile_name ${file_i} NAME) + get_filename_component(outfile_name_we ${file_i} NAME_WE) + get_filename_component(outfile_ext ${file_i} EXT) + get_filename_component(outfile_dir ${file_i} DIRECTORY) + set(outfile "${destination}/${file_i}") + ecbuild_debug("Generate ${outfile}") + generate_file(BACKEND ${backend} INPUT ${file_i} OUTPUT ${outfile}) + list(APPEND outfiles ${outfile}) + endif() + endforeach(file_i) + set(${_PAR_OUTPUT} ${outfiles} PARENT_SCOPE) +endfunction(generate_backend_sources) + +set( BUILD_INTERFACE_INCLUDE_DIR ${CMAKE_BINARY_DIR}/include/ectrans ) + foreach( prec dp sp ) if( HAVE_${prec} ) + generate_backend_includes(BACKEND ${prec} TARGET ectrans_${prec}_includes DESTINATION ${BUILD_INTERFACE_INCLUDE_DIR} ) + generate_backend_sources( BACKEND ${prec} OUTPUT ectrans_${prec}_src DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/generated/ectrans_${prec}) ecbuild_add_library( - TARGET trans_${prec} + TARGET ectrans_${prec} LINKER_LANGUAGE Fortran - SOURCES ${trans_src} - PUBLIC_INCLUDES $ - $ - $ + SOURCES ${ectrans_${prec}_src} + PUBLIC_INCLUDES $ $ - PUBLIC_LIBS fiat parkind_${prec} + $ + PUBLIC_LIBS ectrans_common ectrans_${prec}_includes ) + ectrans_target_fortran_module_directory( - TARGET trans_${prec} - MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/module/trans_${prec} - INSTALL_DIRECTORY module/trans_${prec} + TARGET ectrans_${prec} + MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/module/ectrans_${prec} + INSTALL_DIRECTORY module/ectrans_${prec} ) - target_link_libraries( trans_${prec} PUBLIC fiat parkind_${prec} ) + target_link_libraries( ectrans_${prec} PUBLIC fiat) + if( HAVE_FFTW ) set( FFTW_LINK PRIVATE ) if( LAPACK_${prec} MATCHES "mkl" AND NOT FFTW_LIBRARIES MATCHES "mkl" ) @@ -75,24 +227,24 @@ foreach( prec dp sp ) set( FFTW_LINK PUBLIC ) # Attempt anyway to give FFTW precedence endif() ecbuild_debug("target_link_libraries( trans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} )") - target_link_libraries( trans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} ) - target_include_directories( trans_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) - target_compile_definitions( trans_${prec} PRIVATE WITH_FFTW ) + target_link_libraries( ectrans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} ) + target_include_directories( ectrans_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) + target_compile_definitions( ectrans_${prec} PRIVATE WITH_FFTW ) endif() - ecbuild_debug("target_link_libraries( trans_${prec} ${LAPACK_LINK} ${LAPACK_${prec}} )") - target_link_libraries( trans_${prec} ${LAPACK_LINK} ${LAPACK_${prec}} ) + ecbuild_debug("target_link_libraries( ectrans_${prec} ${LAPACK_LINK} ${LAPACK_${prec}} )") + target_link_libraries( ectrans_${prec} ${LAPACK_LINK} ${LAPACK_${prec}} ) if( HAVE_OMP ) - ecbuild_debug("target_link_libraries( trans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") - target_link_libraries( trans_${prec} PRIVATE OpenMP::OpenMP_Fortran ) + ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") + target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran ) endif() + # This interface library is for backward compatibility, and provides the older includes + ecbuild_add_library( TARGET trans_${prec} TYPE INTERFACE ) + target_include_directories( trans_${prec} INTERFACE $ ) + target_include_directories( trans_${prec} INTERFACE $ ) + target_link_libraries( trans_${prec} INTERFACE fiat ectrans_${prec} parkind_${prec}) endif() endforeach() ## Install trans interface - -file( GLOB trans_interface include/ectrans/* ) -install( - FILES ${trans_interface} - DESTINATION include/ectrans -) +install( DIRECTORY ${BUILD_INTERFACE_INCLUDE_DIR}/ DESTINATION include/ectrans ) diff --git a/src/trans/sedrenames.txt b/src/trans/sedrenames.txt new file mode 100644 index 00000000..73004147 --- /dev/null +++ b/src/trans/sedrenames.txt @@ -0,0 +1,166 @@ +s/ FFT992_CC/ FFT992_CC_VARIANTDESIGNATOR/g +s/ FFT992( *($|\(| |\*))/ FFT992_VARIANTDESIGNATOR\1/g +s/ASRE1_MOD/ASRE1_MOD_VARIANTDESIGNATOR/g +s/ASRE1AD_MOD/ASRE1AD_MOD_VARIANTDESIGNATOR/g +s/ASRE1B_MOD/ASRE1B_MOD_VARIANTDESIGNATOR/g +s/ASRE1BAD_MOD/ASRE1BAD_MOD_VARIANTDESIGNATOR/g +s/BLUESTEIN_MOD/BLUESTEIN_MOD_VARIANTDESIGNATOR/g +s/BUTTERFLY_ALG_MOD/BUTTERFLY_ALG_MOD_VARIANTDESIGNATOR/g +s/CDMAP_MOD/CDMAP_MOD_VARIANTDESIGNATOR/g +s/CPLEDN_MOD/CPLEDN_MOD_VARIANTDESIGNATOR/g +s/DEALLOC_RESOL_MOD/DEALLOC_RESOL_MOD_VARIANTDESIGNATOR/g +s/DIR_TRANS_CTL_MOD/DIR_TRANS_CTL_MOD_VARIANTDESIGNATOR/g +s/DIR_TRANS_CTLAD_MOD/DIR_TRANS_CTLAD_MOD_VARIANTDESIGNATOR/g +s/dir_trans( *($|\(| |\*))/dir_trans_VARIANTDESIGNATOR\1/g +s/DIR_TRANS( *($|\(| |\*))/DIR_TRANS_VARIANTDESIGNATOR\1/g +s/dir_transad( *($|\(| |\*))/dir_transad_VARIANTDESIGNATOR\1/g +s/DIR_TRANSAD( *($|\(| |\*))/DIR_TRANSAD_VARIANTDESIGNATOR\1/g +s/DIST_GRID_32_CTL_MOD/DIST_GRID_32_CTL_MOD_VARIANTDESIGNATOR/g +s/dist_grid_32( *($|\(| |\*))/dist_grid_32_VARIANTDESIGNATOR\1/g +s/DIST_GRID_32( *($|\(| |\*))/DIST_GRID_32_VARIANTDESIGNATOR\1/g +s/DIST_GRID_CTL_MOD/DIST_GRID_CTL_MOD_VARIANTDESIGNATOR/g +s/dist_grid( *($|\(| |\*))/dist_grid_VARIANTDESIGNATOR\1/g +s/DIST_GRID( *($|\(| |\*))/DIST_GRID_VARIANTDESIGNATOR\1/g +s/DIST_SPEC_CONTROL_MOD/DIST_SPEC_CONTROL_MOD_VARIANTDESIGNATOR/g +s/dist_spec( *($|\(| |\*))/dist_spec_VARIANTDESIGNATOR\1/g +s/DIST_SPEC( *($|\(| |\*))/DIST_SPEC_VARIANTDESIGNATOR\1/g +s/ectrans_mod/ectrans_mod_VARIANTDESIGNATOR/g +s/eq_regions_mod/eq_regions_mod_VARIANTDESIGNATOR/g +s/EQ_REGIONS_MOD/EQ_REGIONS_MOD_VARIANTDESIGNATOR/g +s/FFTB_PLAN/FFTB_PLAN_VARIANTDESIGNATOR/g +s/FFTB_TYPE/FFTB_TYPE_VARIANTDESIGNATOR/g +s/FIELD_SPLIT_MOD/FIELD_SPLIT_MOD_VARIANTDESIGNATOR/g +s/FOURIER_IN_MOD/FOURIER_IN_MOD_VARIANTDESIGNATOR/g +s/FOURIER_INAD_MOD/FOURIER_INAD_MOD_VARIANTDESIGNATOR/g +s/FOURIER_OUT_MOD/FOURIER_OUT_MOD_VARIANTDESIGNATOR/g +s/FOURIER_OUTAD_MOD/FOURIER_OUTAD_MOD_VARIANTDESIGNATOR/g +s/FSC_MOD/FSC_MOD_VARIANTDESIGNATOR/g +s/FSCAD_MOD/FSCAD_MOD_VARIANTDESIGNATOR/g +s/FSPGL_INT_MOD/FSPGL_INT_MOD_VARIANTDESIGNATOR/g +s/FTDIR_CTL_MOD/FTDIR_CTL_MOD_VARIANTDESIGNATOR/g +s/FTDIR_CTLAD_MOD/FTDIR_CTLAD_MOD_VARIANTDESIGNATOR/g +s/FTDIR_MOD/FTDIR_MOD_VARIANTDESIGNATOR/g +s/FTDIRAD_MOD/FTDIRAD_MOD_VARIANTDESIGNATOR/g +s/FTINV_CTL_MOD/FTINV_CTL_MOD_VARIANTDESIGNATOR/g +s/FTINV_CTLAD_MOD/FTINV_CTLAD_MOD_VARIANTDESIGNATOR/g +s/FTINV_MOD/FTINV_MOD_VARIANTDESIGNATOR/g +s/FTINVAD_MOD/FTINVAD_MOD_VARIANTDESIGNATOR/g +s/GATH_GRID_32_CTL_MOD/GATH_GRID_32_CTL_MOD_VARIANTDESIGNATOR/g +s/gath_grid_32( *($|\(| |\*))/gath_grid_32_VARIANTDESIGNATOR\1/g +s/GATH_GRID_32( *($|\(| |\*))/GATH_GRID_32_VARIANTDESIGNATOR\1/g +s/GATH_GRID_CTL_MOD/GATH_GRID_CTL_MOD_VARIANTDESIGNATOR/g +s/gath_grid( *($|\(| |\*))/gath_grid_VARIANTDESIGNATOR\1/g +s/GATH_GRID( *($|\(| |\*))/GATH_GRID_VARIANTDESIGNATOR\1/g +s/GATH_SPEC_CONTROL_MOD/GATH_SPEC_CONTROL_MOD_VARIANTDESIGNATOR/g +s/gath_spec( *($|\(| |\*))/gath_spec_VARIANTDESIGNATOR\1/g +s/GATH_SPEC( *($|\(| |\*))/GATH_SPEC_VARIANTDESIGNATOR\1/g +s/GPNORM_TRANS_CTL_MOD/GPNORM_TRANS_CTL_MOD_VARIANTDESIGNATOR/g +s/gpnorm_trans( *($|\(| |\*))/gpnorm_trans_VARIANTDESIGNATOR\1/g +s/GPNORM_TRANS( *($|\(| |\*))/GPNORM_TRANS_VARIANTDESIGNATOR\1/g +s/ini_spec_dist( *($|\(| |\*))/ini_spec_dist_VARIANTDESIGNATOR\1/g +s/INI_SPEC_DIST/INI_SPEC_DIST_VARIANTDESIGNATOR/g +s/INIGPTR_MOD/INIGPTR_MOD_VARIANTDESIGNATOR/g +s/INTERPOL_DECOMP_MOD/INTERPOL_DECOMP_MOD_VARIANTDESIGNATOR/g +s/INV_TRANS_CTL_MOD/INV_TRANS_CTL_MOD_VARIANTDESIGNATOR/g +s/INV_TRANS_CTLAD_MOD/INV_TRANS_CTLAD_MOD_VARIANTDESIGNATOR/g +s/inv_trans( *($|\(| |\*))/inv_trans_VARIANTDESIGNATOR\1/g +s/INV_TRANS( *($|\(| |\*))/INV_TRANS_VARIANTDESIGNATOR\1/g +s/inv_transad( *($|\(| |\*))/inv_transad_VARIANTDESIGNATOR\1/g +s/INV_TRANSAD/INV_TRANSAD_VARIANTDESIGNATOR/g +s/jprb/TYPEDESIGNATOR_LOWER/g +s/JPRB/TYPEDESIGNATOR_UPPER/g +s/JPRH/JPRD/g +s/LDFOU2_MOD/LDFOU2_MOD_VARIANTDESIGNATOR/g +s/LDFOU2AD_MOD/LDFOU2AD_MOD_VARIANTDESIGNATOR/g +s/LEDIR_MOD/LEDIR_MOD_VARIANTDESIGNATOR/g +s/LEDIRAD_MOD/LEDIRAD_MOD_VARIANTDESIGNATOR/g +s/LEINV_MOD/LEINV_MOD_VARIANTDESIGNATOR/g +s/LEINVAD_MOD/LEINVAD_MOD_VARIANTDESIGNATOR/g +s/LTDIR_CTL_MOD/LTDIR_CTL_MOD_VARIANTDESIGNATOR/g +s/LTDIR_CTLAD_MOD/LTDIR_CTLAD_MOD_VARIANTDESIGNATOR/g +s/LTDIR_MOD/LTDIR_MOD_VARIANTDESIGNATOR/g +s/LTDIRAD_MOD/LTDIRAD_MOD_VARIANTDESIGNATOR/g +s/LTINV_CTL_MOD/LTINV_CTL_MOD_VARIANTDESIGNATOR/g +s/LTINV_CTLAD_MOD/LTINV_CTLAD_MOD_VARIANTDESIGNATOR/g +s/LTINV_MOD/LTINV_MOD_VARIANTDESIGNATOR/g +s/LTINVAD_MOD/LTINVAD_MOD_VARIANTDESIGNATOR/g +s/MYRECVSET_MOD/MYRECVSET_MOD_VARIANTDESIGNATOR/g +s/MYSENDSET_MOD/MYSENDSET_MOD_VARIANTDESIGNATOR/g +s/parkind1/ec_parkind/g +s/PARKIND1/EC_PARKIND/g +s/PARKIND2/EC_PARKIND/g +s/PE2SET_MOD/PE2SET_MOD_VARIANTDESIGNATOR/g +s/PRE_SULEG_MOD/PRE_SULEG_MOD_VARIANTDESIGNATOR/g +s/PREPSNM_MOD/PREPSNM_MOD_VARIANTDESIGNATOR/g +s/PRFI1_MOD/PRFI1_MOD_VARIANTDESIGNATOR/g +s/PRFI1AD_MOD/PRFI1AD_MOD_VARIANTDESIGNATOR/g +s/PRFI1B_MOD/PRFI1B_MOD_VARIANTDESIGNATOR/g +s/PRFI1BAD_MOD/PRFI1BAD_MOD_VARIANTDESIGNATOR/g +s/PRFI2_MOD/PRFI2_MOD_VARIANTDESIGNATOR/g +s/PRFI2AD_MOD/PRFI2AD_MOD_VARIANTDESIGNATOR/g +s/PRFI2B_MOD/PRFI2B_MOD_VARIANTDESIGNATOR/g +s/PRFI2BAD_MOD/PRFI2BAD_MOD_VARIANTDESIGNATOR/g +s/READ_LEGPOL_MOD/READ_LEGPOL_MOD_VARIANTDESIGNATOR/g +s/seefmm_mix/seefmm_mix_VARIANTDESIGNATOR/g +s/SEEFMM_MIX/SEEFMM_MIX_VARIANTDESIGNATOR/g +s/SET_RESOL_MOD/SET_RESOL_MOD_VARIANTDESIGNATOR/g +s/SET2PE_MOD/SET2PE_MOD_VARIANTDESIGNATOR/g +s/SET99( *($|\(| |\*))/SET99_VARIANTDESIGNATOR\1/g +s/SET99B/SET99B_VARIANTDESIGNATOR/g +s/SETUP_DIMS_MOD/SETUP_DIMS_MOD_VARIANTDESIGNATOR/g +s/SETUP_GEOM_MOD/SETUP_GEOM_MOD_VARIANTDESIGNATOR/g +s/SETUP_TRANS( *($|\(| |\*))/SETUP_TRANS_VARIANTDESIGNATOR\1/g +s/setup_trans( *($|\(| |\*|\.h))/setup_trans_VARIANTDESIGNATOR\1/g +s/setup_trans0( *($|\(| |\*|\.h))/setup_trans0_VARIANTDESIGNATOR\1/g +s/SETUP_TRANS0/SETUP_TRANS0_VARIANTDESIGNATOR/g +s/SHUFFLE_MOD/SHUFFLE_MOD_VARIANTDESIGNATOR/g +s/specnorm/specnorm_VARIANTDESIGNATOR/g +s/SPECNORM/SPECNORM_VARIANTDESIGNATOR/g +s/SPNORM_CTL_MOD/SPNORM_CTL_MOD_VARIANTDESIGNATOR/g +s/SPNORMC_MOD/SPNORMC_MOD_VARIANTDESIGNATOR/g +s/SPNORMD_MOD/SPNORMD_MOD_VARIANTDESIGNATOR/g +s/SPNSDE_MOD/SPNSDE_MOD_VARIANTDESIGNATOR/g +s/SPNSDEAD_MOD/SPNSDEAD_MOD_VARIANTDESIGNATOR/g +s/SUFFT_MOD/SUFFT_MOD_VARIANTDESIGNATOR/g +s/SULEG_MOD/SULEG_MOD_VARIANTDESIGNATOR/g +s/SUMP_TRANS_MOD/SUMP_TRANS_MOD_VARIANTDESIGNATOR/g +s/SUMP_TRANS_PRELEG_MOD/SUMP_TRANS_PRELEG_MOD_VARIANTDESIGNATOR/g +s/SUMP_TRANS0_MOD/SUMP_TRANS0_MOD_VARIANTDESIGNATOR/g +s/SUMPLAT_MOD/SUMPLAT_MOD_VARIANTDESIGNATOR/g +s/SUMPLATB_MOD/SUMPLATB_MOD_VARIANTDESIGNATOR/g +s/SUMPLATBEQ_MOD/SUMPLATBEQ_MOD_VARIANTDESIGNATOR/g +s/SUMPLATF_MOD/SUMPLATF_MOD_VARIANTDESIGNATOR/g +s/SUSTAONL_MOD/SUSTAONL_MOD_VARIANTDESIGNATOR/g +s/SUTRLE_MOD/SUTRLE_MOD_VARIANTDESIGNATOR/g +s/SUWAVEDI_MOD/SUWAVEDI_MOD_VARIANTDESIGNATOR/g +s/TPM_DISTR/TPM_DISTR_VARIANTDESIGNATOR/g +s/TPM_FFT( *(,|$| ))/TPM_FFT_VARIANTDESIGNATOR\1/g +s/TPM_FFTW/TPM_FFTW_VARIANTDESIGNATOR/g +s/TPM_FIELDS/TPM_FIELDS_VARIANTDESIGNATOR/g +s/TPM_FLT/TPM_FLT_VARIANTDESIGNATOR/g +s/TPM_TRANS/TPM_TRANS_VARIANTDESIGNATOR/g +s/trans_end( *($|\(| |\*|\.h))/trans_end_VARIANTDESIGNATOR\1/g +s/TRANS_END/TRANS_END_VARIANTDESIGNATOR/g +s/trans_inq( *($|\(| |\*))/trans_inq_VARIANTDESIGNATOR\1/g +s/TRANS_INQ/TRANS_INQ_VARIANTDESIGNATOR/g +s/TRANS_PNM/TRANS_PNM_VARIANTDESIGNATOR/g +s/trans_release( *($|\(| |\*|\.h))/trans_release_VARIANTDESIGNATOR\1/g +s/TRANS_RELEASE/TRANS_RELEASE_VARIANTDESIGNATOR/g +s/TRGTOL_MOD/TRGTOL_MOD_VARIANTDESIGNATOR/g +s/TRLTOG_MOD/TRLTOG_MOD_VARIANTDESIGNATOR/g +s/TRLTOM_MOD/TRLTOM_MOD_VARIANTDESIGNATOR/g +s/TRMTOL_MOD/TRMTOL_MOD_VARIANTDESIGNATOR/g +s/UPDSP_MOD/UPDSP_MOD_VARIANTDESIGNATOR/g +s/UPDSPAD_MOD/UPDSPAD_MOD_VARIANTDESIGNATOR/g +s/UPDSPB_MOD/UPDSPB_MOD_VARIANTDESIGNATOR/g +s/UPDSPBAD_MOD/UPDSPBAD_MOD_VARIANTDESIGNATOR/g +s/UVTVD_MOD/UVTVD_MOD_VARIANTDESIGNATOR/g +s/UVTVDAD_MOD/UVTVDAD_MOD_VARIANTDESIGNATOR/g +s/VD2UV_CTL_MOD/VD2UV_CTL_MOD_VARIANTDESIGNATOR/g +s/VD2UV_MOD/VD2UV_MOD_VARIANTDESIGNATOR/g +s/VDTUV_MOD/VDTUV_MOD_VARIANTDESIGNATOR/g +s/VDTUVAD_MOD/VDTUVAD_MOD_VARIANTDESIGNATOR/g +s/VORDIV_TO_UV/VORDIV_TO_UV_VARIANTDESIGNATOR/g +s/WRITE_LEGPOL_MOD/WRITE_LEGPOL_MOD_VARIANTDESIGNATOR/g +s/wts500_mod/wts500_mod_VARIANTDESIGNATOR/g +s/WTS500_MOD/WTS500_MOD_VARIANTDESIGNATOR/g diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8dbb16f6..bf06ec83 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -33,13 +33,15 @@ endif() if( HAVE_DOUBLE_PRECISION ) set( trans trans_dp ) + set( parkind parkind_dp ) else() set( trans trans_sp ) + set( parkind parkind_sp ) endif() ecbuild_add_test(TARGET ectrans_test_adjoint SOURCES trans/test_adjoint.F90 - LIBS ${trans} + LIBS ${trans} ${parkind} LINKER_LANGUAGE Fortran ) if( TEST ectrans_test_adjoint AND HAVE_OMP ) From 8bd218792a135af2735e412c0c8df775fce661f0 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Wed, 7 Feb 2024 17:24:40 +0100 Subject: [PATCH 03/48] Remove sugawc from compilation --- src/trans/CMakeLists.txt | 2 -- src/trans/{ => maybe_unused}/external/sugawc.F90 | 0 src/trans/{ => maybe_unused}/include/ectrans/sugawc.h | 0 3 files changed, 2 deletions(-) rename src/trans/{ => maybe_unused}/external/sugawc.F90 (100%) rename src/trans/{ => maybe_unused}/include/ectrans/sugawc.h (100%) diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index c4157ea5..5462e924 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -49,11 +49,9 @@ list( APPEND ectrans_common_src internal/tpm_geometry.F90 internal/tpm_pol.F90 external/get_current.F90 - external/sugawc.F90 ${CMAKE_CURRENT_BINARY_DIR}/internal/ectrans_version_mod.F90 ) list( APPEND ectrans_common_includes - include/ectrans/sugawc.h include/ectrans/get_current.h ) diff --git a/src/trans/external/sugawc.F90 b/src/trans/maybe_unused/external/sugawc.F90 similarity index 100% rename from src/trans/external/sugawc.F90 rename to src/trans/maybe_unused/external/sugawc.F90 diff --git a/src/trans/include/ectrans/sugawc.h b/src/trans/maybe_unused/include/ectrans/sugawc.h similarity index 100% rename from src/trans/include/ectrans/sugawc.h rename to src/trans/maybe_unused/include/ectrans/sugawc.h From 8941cbfa476adfd8255f37013efb068ff5a5154a Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Thu, 8 Feb 2024 10:36:46 +0100 Subject: [PATCH 04/48] Make some internal modules and routines precision independent --- src/trans/external/setup_trans0.F90 | 6 +- src/trans/internal/eq_regions_mod.F90 | 124 +++++++++---------- src/trans/internal/myrecvset_mod.F90 | 2 +- src/trans/internal/mysendset_mod.F90 | 2 +- src/trans/internal/pe2set_mod.F90 | 2 +- src/trans/internal/pre_suleg_mod.F90 | 1 - src/trans/internal/set2pe_mod.F90 | 2 +- src/trans/internal/sump_trans0_mod.F90 | 2 +- src/trans/internal/sump_trans_mod.F90 | 6 +- src/trans/internal/sump_trans_preleg_mod.F90 | 2 +- src/trans/internal/sumplat_mod.F90 | 6 +- src/trans/internal/sumplatb_mod.F90 | 6 +- src/trans/internal/sumplatbeq_mod.F90 | 8 +- src/trans/internal/sumplatf_mod.F90 | 2 +- src/trans/internal/sustaonl_mod.F90 | 22 ++-- src/trans/internal/sutrle_mod.F90 | 2 +- src/trans/internal/suwavedi_mod.F90 | 2 +- src/trans/internal/tpm_distr.F90 | 4 +- src/trans/internal/vd2uv_mod.F90 | 2 +- 19 files changed, 101 insertions(+), 102 deletions(-) diff --git a/src/trans/external/setup_trans0.F90 b/src/trans/external/setup_trans0.F90 index 8d557d71..cbc22baf 100644 --- a/src/trans/external/setup_trans0.F90 +++ b/src/trans/external/setup_trans0.F90 @@ -68,7 +68,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM ,JPRB ,JPRD !ifndef INTERFACE @@ -126,7 +126,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& LSYNC_TRANS=.FALSE. NTRANS_SYNC_LEVEL=0 LEQ_REGIONS=.FALSE. -RA=6371229._JPRB +RA=6371229._JPRD LALLOPERM=.FALSE. NSTACK_MEMORY_TR=0 @@ -209,7 +209,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& ENDIF IF(PRESENT(PRAD)) THEN - RA=PRAD + RA=REAL(PRAD,JPRD) ENDIF IF(PRESENT(LDALLOPERM)) THEN diff --git a/src/trans/internal/eq_regions_mod.F90 b/src/trans/internal/eq_regions_mod.F90 index c01c3fc9..7c704f6c 100644 --- a/src/trans/internal/eq_regions_mod.F90 +++ b/src/trans/internal/eq_regions_mod.F90 @@ -70,7 +70,7 @@ MODULE eq_regions_mod ! !-------------------------------------------------------------------------------- ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE @@ -81,7 +81,7 @@ MODULE eq_regions_mod PUBLIC eq_regions,l_regions_debug,n_regions_ns,n_regions_ew,n_regions,my_region_ns,my_region_ew PUBLIC eq_regions_t, eq_regions_save, eq_regions_load, eq_regions_free -real(kind=jprb) pi +real(kind=jprd) :: pi type eq_regions_t logical :: l_regions_debug=.false. @@ -144,14 +144,14 @@ subroutine eq_regions(N) ! eq_regions uses the zonal equal area sphere partitioning algorithm to partition ! the surface of a sphere into N regions of equal area and small diameter. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE integer(kind=jpim),intent(in) :: N integer(kind=jpim) :: n_collars,j -real(kind=jprb),allocatable :: r_regions(:) -real(kind=jprb) :: c_polar +real(kind=jprd),allocatable :: r_regions(:) +real(kind=jprd) :: c_polar -pi=2.0_jprb*asin(1.0_jprb) +pi=2.0_jprd*asin(1.0_jprd) n_regions(:)=0 @@ -223,10 +223,10 @@ function num_collars(N,c_polar,a_ideal) result(num_c) ! Given N, an ideal angle, and c_polar, ! determine n_collars, the number of collars between the polar caps. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE integer(kind=jpim),intent(in) :: N -real(kind=jprb),intent(in) :: a_ideal,c_polar +real(kind=jprd),intent(in) :: a_ideal,c_polar integer(kind=jpim) :: num_c logical enough enough = (N > 2) .and. (a_ideal > 0) @@ -251,22 +251,22 @@ subroutine ideal_region_list(N,c_polar,n_collars,r_regions) ! r_regions[n_collars+2] is 1. ! The sum of r_regions is N. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE integer(kind=jpim),intent(in) :: N,n_collars -real(kind=jprb),intent(in) :: c_polar -real(kind=jprb),intent(out) :: r_regions(n_collars+2) +real(kind=jprd),intent(in) :: c_polar +real(kind=jprd),intent(out) :: r_regions(n_collars+2) integer(kind=jpim) :: collar_n -real(kind=jprb) :: ideal_region_area,ideal_collar_area -real(kind=jprb) :: a_fitting -r_regions(:)=0.0_jprb -r_regions(1) = 1.0_jprb +real(kind=jprd) :: ideal_region_area,ideal_collar_area +real(kind=jprd) :: a_fitting +r_regions(:)=0.0_jprd +r_regions(1) = 1.0_jprd if( n_collars > 0 )then ! ! Based on n_collars and c_polar, determine a_fitting, ! the collar angle such that n_collars collars fit between the polar caps. ! - a_fitting = (pi-2.0_jprb*c_polar)/float(n_collars) + a_fitting = (pi-2.0_jprd*c_polar)/real(n_collars,jprd) ideal_region_area = area_of_ideal_region(N) do collar_n=1,n_collars ideal_collar_area = area_of_collar(c_polar+(collar_n-1)*a_fitting, & @@ -285,11 +285,11 @@ function ideal_collar_angle(N) result(ideal) ! IDEAL_COLLAR_ANGLE(N) sets ANGLE to the ideal angle for the ! spherical collars of an EQ partition of the unit sphere S^2 into N regions. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE integer(kind=jpim),intent(in) :: N -real(kind=jprb) :: ideal -ideal = area_of_ideal_region(N)**(0.5_jprb) +real(kind=jprd) :: ideal +ideal = area_of_ideal_region(N)**(0.5_jprd) return end function ideal_collar_angle @@ -305,17 +305,17 @@ subroutine round_to_naturals(N,n_collars,r_regions) ! n_regions[n_collars+2] is 1. ! The sum of n_regions is N. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE integer(kind=jpim),intent(in) :: N,n_collars -real(kind=jprb),intent(in) :: r_regions(n_collars+2) +real(kind=jprd),intent(in) :: r_regions(n_collars+2) integer(kind=jpim) :: zone_n -real(kind=jprb) :: discrepancy +real(kind=jprd) :: discrepancy n_regions(1:n_collars+2) = r_regions(:) -discrepancy = 0.0_jprb +discrepancy = 0.0_jprd do zone_n = 1,n_collars+2 n_regions(zone_n) = nint(r_regions(zone_n)+discrepancy); - discrepancy = discrepancy+r_regions(zone_n)-float(n_regions(zone_n)); + discrepancy = discrepancy+r_regions(zone_n)-real(n_regions(zone_n),jprd); enddo return end subroutine round_to_naturals @@ -324,13 +324,13 @@ function polar_colat(N) result(polar_c) ! ! Given N, determine the colatitude of the North polar spherical cap. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE integer(kind=jpim),intent(in) :: N -real(kind=jprb) :: area -real(kind=jprb) :: polar_c +real(kind=jprd) :: area +real(kind=jprd) :: polar_c if( N == 1 ) polar_c=pi -if( N == 2 ) polar_c=pi/2.0_jprb +if( N == 2 ) polar_c=pi/2.0_jprd if( N > 2 )then area=area_of_ideal_region(N) polar_c=sradius_of_cap(area) @@ -343,13 +343,13 @@ function area_of_ideal_region(N) result(area) ! AREA_OF_IDEAL_REGION(N) sets AREA to be the area of one of N equal ! area regions on S^2, that is 1/N times AREA_OF_SPHERE. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE integer(kind=jpim),intent(in) :: N -real(kind=jprb) :: area_of_sphere -real(kind=jprb) :: area -area_of_sphere = (2.0_jprb*pi**1.5_jprb/gamma(1.5_jprb)) -area = area_of_sphere/float(N) +real(kind=jprd) :: area_of_sphere +real(kind=jprd) :: area +area_of_sphere = (2.0_jprd*pi**1.5_jprd/gamma(1.5_jprd)) +area = area_of_sphere/real(N,jprd) return end function area_of_ideal_region @@ -358,11 +358,11 @@ function sradius_of_cap(area) result(sradius) ! SRADIUS_OF_CAP(AREA) returns the spherical radius of ! an S^2 spherical cap of area AREA. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE -real(kind=jprb),intent(in) :: area -real(kind=jprb) :: sradius -sradius = 2.0_jprb*asin(sqrt(area/pi)/2.0_jprb) +real(kind=jprd),intent(in) :: area +real(kind=jprd) :: sradius +sradius = 2.0_jprd*asin(sqrt(area/pi)/2.0_jprd) return end function sradius_of_cap @@ -374,10 +374,10 @@ function area_of_collar(a_top, a_bot) result(area) ! collar specified by A_TOP, A_BOT, where A_TOP is top (smaller) spherical radius, ! A_BOT is bottom (larger) spherical radius. ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE -real(kind=jprb),intent(in) :: a_top,a_bot -real(kind=jprb) area +real(kind=jprd),intent(in) :: a_top,a_bot +real(kind=jprd) area area = area_of_cap(a_bot) - area_of_cap(a_top) return end function area_of_collar @@ -389,37 +389,37 @@ function area_of_cap(s_cap) result(area) ! AREA_OF_CAP(S_CAP) sets AREA to be the area of an S^2 spherical ! cap of spherical radius S_CAP. ! -real(kind=jprb),intent(in) :: s_cap -real(kind=jprb) area -area = 4.0_jprb*pi * sin(s_cap/2.0_jprb)**2 +real(kind=jprd),intent(in) :: s_cap +real(kind=jprd) area +area = 4.0_jprd*pi * sin(s_cap/2.0_jprd)**2 return end function area_of_cap function gamma(x) result(gamma_res) ! -USE PARKIND1 ,ONLY : JPIM, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPRD IMPLICIT NONE -real(kind=jprb),intent(in) :: x -real(kind=jprb) :: gamma_res -real(kind=jprb) :: p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13 -real(kind=jprb) :: w,y +real(kind=jprd),intent(in) :: x +real(kind=jprd) :: gamma_res +real(kind=jprd) :: p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13 +real(kind=jprd) :: w,y integer(kind=jpim) :: k,n parameter (& -& p0 = 0.999999999999999990e+00_jprb,& -& p1 = -0.422784335098466784e+00_jprb,& -& p2 = -0.233093736421782878e+00_jprb,& -& p3 = 0.191091101387638410e+00_jprb,& -& p4 = -0.024552490005641278e+00_jprb,& -& p5 = -0.017645244547851414e+00_jprb,& -& p6 = 0.008023273027855346e+00_jprb) +& p0 = 0.999999999999999990e+00_jprd,& +& p1 = -0.422784335098466784e+00_jprd,& +& p2 = -0.233093736421782878e+00_jprd,& +& p3 = 0.191091101387638410e+00_jprd,& +& p4 = -0.024552490005641278e+00_jprd,& +& p5 = -0.017645244547851414e+00_jprd,& +& p6 = 0.008023273027855346e+00_jprd) parameter (& -& p7 = -0.000804329819255744e+00_jprb,& -& p8 = -0.000360837876648255e+00_jprb,& -& p9 = 0.000145596568617526e+00_jprb,& -& p10 = -0.000017545539395205e+00_jprb,& -& p11 = -0.000002591225267689e+00_jprb,& -& p12 = 0.000001337767384067e+00_jprb,& -& p13 = -0.000000199542863674e+00_jprb) +& p7 = -0.000804329819255744e+00_jprd,& +& p8 = -0.000360837876648255e+00_jprd,& +& p9 = 0.000145596568617526e+00_jprd,& +& p10 = -0.000017545539395205e+00_jprd,& +& p11 = -0.000002591225267689e+00_jprd,& +& p12 = 0.000001337767384067e+00_jprd,& +& p13 = -0.000000199542863674e+00_jprd) n = nint(x - 2) w = x - (n + 2) y = ((((((((((((p13 * w + p12) * w + p11) * w + p10) *& diff --git a/src/trans/internal/myrecvset_mod.F90 b/src/trans/internal/myrecvset_mod.F90 index 948b6754..14638a7a 100644 --- a/src/trans/internal/myrecvset_mod.F90 +++ b/src/trans/internal/myrecvset_mod.F90 @@ -52,7 +52,7 @@ FUNCTION MYRECVSET(KSETS,KMYSET,KSET) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS ! diff --git a/src/trans/internal/mysendset_mod.F90 b/src/trans/internal/mysendset_mod.F90 index 86c02bd5..fa33a1a1 100644 --- a/src/trans/internal/mysendset_mod.F90 +++ b/src/trans/internal/mysendset_mod.F90 @@ -50,7 +50,7 @@ FUNCTION MYSENDSET(KSETS,KMYSET,KSET) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS IMPLICIT NONE diff --git a/src/trans/internal/pe2set_mod.F90 b/src/trans/internal/pe2set_mod.F90 index 2197ef06..ecda01f5 100644 --- a/src/trans/internal/pe2set_mod.F90 +++ b/src/trans/internal/pe2set_mod.F90 @@ -70,7 +70,7 @@ SUBROUTINE PE2SET(KPE,KPRGPNS,KPRGPEW,KPRTRW,KPRTRV) ! Revision : 98-10-13 row ordering ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM USE TPM_DISTR ,ONLY : LEQ_REGIONS, NPRGPEW, NPROC, NPRTRV USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_NS diff --git a/src/trans/internal/pre_suleg_mod.F90 b/src/trans/internal/pre_suleg_mod.F90 index 8d661a86..02434109 100644 --- a/src/trans/internal/pre_suleg_mod.F90 +++ b/src/trans/internal/pre_suleg_mod.F90 @@ -13,7 +13,6 @@ MODULE PRE_SULEG_MOD CONTAINS SUBROUTINE PRE_SULEG USE PARKIND1 ,ONLY : JPRD, JPIM -USE PARKIND2 ,ONLY : JPRH USE TPM_GEN ,ONLY : NPRINTLEV,NOUT USE TPM_DIM ,ONLY : R USE TPM_CONSTANTS ,ONLY: RA diff --git a/src/trans/internal/set2pe_mod.F90 b/src/trans/internal/set2pe_mod.F90 index 595d5c62..804f1288 100644 --- a/src/trans/internal/set2pe_mod.F90 +++ b/src/trans/internal/set2pe_mod.F90 @@ -67,7 +67,7 @@ SUBROUTINE SET2PE(KPE,KPRGPNS,KPRGPEW,KPRTRW,KPRTRV) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM USE TPM_DISTR ,ONLY : LEQ_REGIONS, NPRGPEW, NPRGPNS, NPRTRV, NPRTRW USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_NS diff --git a/src/trans/internal/sump_trans0_mod.F90 b/src/trans/internal/sump_trans0_mod.F90 index 54ed0825..0468de59 100644 --- a/src/trans/internal/sump_trans0_mod.F90 +++ b/src/trans/internal/sump_trans0_mod.F90 @@ -14,7 +14,7 @@ SUBROUTINE SUMP_TRANS0 ! Set up distributed environment for the transform package (part 0) -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM USE MPL_MODULE ,ONLY : MPL_GROUPS_CREATE, MPL_MYRANK, MPL_NPROC USE TPM_GEN ,ONLY : NOUT, LMPOFF, NPRINTLEV diff --git a/src/trans/internal/sump_trans_mod.F90 b/src/trans/internal/sump_trans_mod.F90 index 86d5051e..7526556c 100644 --- a/src/trans/internal/sump_trans_mod.F90 +++ b/src/trans/internal/sump_trans_mod.F90 @@ -17,7 +17,7 @@ SUBROUTINE SUMP_TRANS ! Modifications : ! P.Marguinaud : 11-Sep-2012 : Fix twice allocated pointer -USE PARKIND1 ,ONLY : JPIM ,JPRB, JPRD +USE PARKIND1 ,ONLY : JPIM ,JPRD USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DIM ,ONLY : R @@ -41,8 +41,8 @@ SUBROUTINE SUMP_TRANS INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) -REAL(KIND=JPRB),ALLOCATABLE :: ZDUM(:) -REAL(KIND=JPRB) :: ZMEDIAP +REAL(KIND=JPRD),ALLOCATABLE :: ZDUM(:) +REAL(KIND=JPRD) :: ZMEDIAP LOGICAL :: LLP1,LLP2 diff --git a/src/trans/internal/sump_trans_preleg_mod.F90 b/src/trans/internal/sump_trans_preleg_mod.F90 index c02a86be..49b996fb 100644 --- a/src/trans/internal/sump_trans_preleg_mod.F90 +++ b/src/trans/internal/sump_trans_preleg_mod.F90 @@ -14,7 +14,7 @@ SUBROUTINE SUMP_TRANS_PRELEG ! Set up distributed environment for the transform package (part 1) -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DIM ,ONLY : R diff --git a/src/trans/internal/sumplat_mod.F90 b/src/trans/internal/sumplat_mod.F90 index 88bccf5d..1182af91 100644 --- a/src/trans/internal/sumplat_mod.F90 +++ b/src/trans/internal/sumplat_mod.F90 @@ -88,7 +88,7 @@ SUBROUTINE SUMPLAT(KDGL,KPROC,KPROCA,KMYSETA,LDSPLIT,LDEQ_REGIONS,& ! not lelam features in new routine sumplatb.F ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE EC_PARKIND ,ONLY : JPIM ,JPRD USE TPM_GEOMETRY ,ONLY : G @@ -101,14 +101,14 @@ SUBROUTINE SUMPLAT(KDGL,KPROC,KPROCA,KMYSETA,LDSPLIT,LDEQ_REGIONS,& ! * DUMMY: -REAL(KIND=JPRB),INTENT(OUT) :: PMEDIAP +REAL(KIND=JPRD),INTENT(OUT) :: PMEDIAP INTEGER(KIND=JPIM),INTENT(OUT) :: KMEDIAP INTEGER(KIND=JPIM),INTENT(OUT) :: KRESTM INTEGER(KIND=JPIM),INTENT(IN) :: KDGL INTEGER(KIND=JPIM),INTENT(IN) :: KPROC INTEGER(KIND=JPIM),INTENT(IN) :: KPROCA INTEGER(KIND=JPIM),INTENT(IN) :: KMYSETA -REAL(KIND=JPRB),INTENT(IN) :: PWEIGHT(:) +REAL(KIND=JPRD),INTENT(IN) :: PWEIGHT(:) LOGICAL,INTENT(INOUT) :: LDWEIGHTED_DISTR INTEGER(KIND=JPIM),INTENT(OUT) :: KFRSTLAT(:) INTEGER(KIND=JPIM),INTENT(OUT) :: KLSTLAT(:) diff --git a/src/trans/internal/sumplatb_mod.F90 b/src/trans/internal/sumplatb_mod.F90 index 3633dafc..eb44cd29 100644 --- a/src/trans/internal/sumplatb_mod.F90 +++ b/src/trans/internal/sumplatb_mod.F90 @@ -66,7 +66,7 @@ SUBROUTINE SUMPLATB(KDGSA,KDGL,KPROCA,KLOENG,LDSPLIT,LDFOURIER,& ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM, JPIB, JPRB +USE EC_PARKIND ,ONLY : JPIM, JPIB, JPRD IMPLICIT NONE @@ -92,7 +92,7 @@ SUBROUTINE SUMPLATB(KDGSA,KDGL,KPROCA,KLOENG,LDSPLIT,LDFOURIER,& INTEGER(KIND=JPIM) :: ITOT_TOP, ITOT_BOT, IGL_TOP, IGL_BOT INTEGER(KIND=JPIB) :: IMEDIA,ITOT -!REAL(KIND=JPRB) :: ZLG +!REAL(KIND=JPRD) :: ZLG LOGICAL :: LLDONE,LLSIMPLE ! ----------------------------------------------------------------- @@ -105,7 +105,7 @@ SUBROUTINE SUMPLATB(KDGSA,KDGL,KPROCA,KLOENG,LDSPLIT,LDFOURIER,& IF( LDFOURIER )THEN ! DO JGL=1,KDGL -! ZLG=LOG(FLOAT(KLOENG(JGL))) +! ZLG=LOG(REAL(KLOENG(JGL),JPRD)) ! ICOST(JGL)=KLOENG(JGL)*ZLG*SQRT(ZLG) ! ENDDO diff --git a/src/trans/internal/sumplatbeq_mod.F90 b/src/trans/internal/sumplatbeq_mod.F90 index e9ba66ad..d69e2d04 100644 --- a/src/trans/internal/sumplatbeq_mod.F90 +++ b/src/trans/internal/sumplatbeq_mod.F90 @@ -72,7 +72,7 @@ SUBROUTINE SUMPLATBEQ(KDGSA,KDGL,KPROC,KPROCA,KLOENG,LDSPLIT,LDEQ_REGIONS,& ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE EC_PARKIND ,ONLY : JPIM ,JPRD USE TPM_DISTR ,ONLY : MYPROC USE EQ_REGIONS_MOD ,ONLY : N_REGIONS @@ -88,11 +88,11 @@ SUBROUTINE SUMPLATBEQ(KDGSA,KDGL,KPROC,KPROCA,KLOENG,LDSPLIT,LDEQ_REGIONS,& INTEGER(KIND=JPIM),INTENT(IN) :: KPROC INTEGER(KIND=JPIM),INTENT(IN) :: KPROCA INTEGER(KIND=JPIM),INTENT(IN) :: KLOENG(KDGSA:KDGL) -REAL(KIND=JPRB),INTENT(IN) :: PWEIGHT(:) +REAL(KIND=JPRD),INTENT(IN) :: PWEIGHT(:) LOGICAL,INTENT(IN) :: LDSPLIT LOGICAL,INTENT(IN) :: LDEQ_REGIONS LOGICAL,INTENT(INOUT) :: LDWEIGHTED_DISTR -REAL(KIND=JPRB),INTENT(OUT) :: PMEDIAP +REAL(KIND=JPRD),INTENT(OUT) :: PMEDIAP INTEGER(KIND=JPIM),INTENT(OUT) :: KMEDIAP INTEGER(KIND=JPIM),INTENT(OUT) :: KRESTM INTEGER(KIND=JPIM),INTENT(OUT) :: KINDIC(KPROCA) @@ -104,7 +104,7 @@ SUBROUTINE SUMPLATBEQ(KDGSA,KDGL,KPROC,KPROCA,KLOENG,LDSPLIT,LDEQ_REGIONS,& ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: ICOMP, IGL, IMAXI, IMEDIA, IMEDIAP, ITOT, JA, JB, IA, JGL,& &ILAST,IREST,IPE,I2REGIONS,IGP -REAL(KIND=JPRB) :: ZMEDIA, ZCOMP +REAL(KIND=JPRD) :: ZMEDIA, ZCOMP LOGICAL :: LLDONE ! ----------------------------------------------------------------- diff --git a/src/trans/internal/sumplatf_mod.F90 b/src/trans/internal/sumplatf_mod.F90 index 3de226e4..33e41823 100644 --- a/src/trans/internal/sumplatf_mod.F90 +++ b/src/trans/internal/sumplatf_mod.F90 @@ -75,7 +75,7 @@ SUBROUTINE SUMPLATF(KDGL,KPROCA,KMYSETA,& ! not lelam features in new routine sumplatb.F ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM USE TPM_GEOMETRY ,ONLY : G diff --git a/src/trans/internal/sustaonl_mod.F90 b/src/trans/internal/sustaonl_mod.F90 index f9ed25ce..39977dd1 100644 --- a/src/trans/internal/sustaonl_mod.F90 +++ b/src/trans/internal/sustaonl_mod.F90 @@ -64,8 +64,8 @@ SUBROUTINE SUSTAONL(KMEDIAP,KRESTM,LDWEIGHTED_DISTR,PWEIGHT,PMEDIAP,KPROCAGP) ! R. El Khatib 26-Apr-2018 vectorization ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB, JPRD -USE MPL_MODULE ,ONLY : MPL_ALLGATHERV, MPL_RECV, MPL_SEND +USE EC_PARKIND ,ONLY : JPIM, JPRD +USE MPL_MODULE ,ONLY : MPL_ALLGATHERV, MPL_RECV, MPL_SEND USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DIM ,ONLY : R @@ -83,9 +83,9 @@ SUBROUTINE SUSTAONL(KMEDIAP,KRESTM,LDWEIGHTED_DISTR,PWEIGHT,PMEDIAP,KPROCAGP) ! DUMMY INTEGER(KIND=JPIM),INTENT(IN) :: KMEDIAP INTEGER(KIND=JPIM),INTENT(IN) :: KRESTM -REAL(KIND=JPRB),INTENT(IN) :: PWEIGHT(:) +REAL(KIND=JPRD),INTENT(IN) :: PWEIGHT(:) LOGICAL,INTENT(IN) :: LDWEIGHTED_DISTR -REAL(KIND=JPRB),INTENT(IN) :: PMEDIAP +REAL(KIND=JPRD),INTENT(IN) :: PMEDIAP INTEGER(KIND=JPIM),INTENT(IN) :: KPROCAGP(:) ! LOCAL @@ -100,19 +100,19 @@ SUBROUTINE SUSTAONL(KMEDIAP,KRESTM,LDWEIGHTED_DISTR,PWEIGHT,PMEDIAP,KPROCAGP) &IREST, ISEND, ITAG, JA, JB, JGL, JL, JNPTSRE, & &ILAT, ILON, ILOEN INTEGER(KIND=JPIM),ALLOCATABLE :: ICOMBUFG(:) -REAL(KIND=JPRB),ALLOCATABLE :: ZWEIGHT(:,:) +REAL(KIND=JPRD),ALLOCATABLE :: ZWEIGHT(:,:) INTEGER(KIND=JPIM) :: JJ, ILENG(NPROC), IOFF(NPROC) LOGICAL :: LLABORT LOGICAL :: LLP1,LLP2 -REAL(KIND=JPRB) :: ZDIVID(R%NDGL) -REAL(KIND=JPRB) :: ZCOMP,ZPI +REAL(KIND=JPRD) :: ZDIVID(R%NDGL) +REAL(KIND=JPRD) :: ZCOMP,ZPI INTEGER(KIND=JPIM) :: ILATMD,ILATMD1 ! ----------------------------------------------------------------- -ZPI = 2.0_JPRB*ASIN(1.0_JPRB) +ZPI = 2.0_JPRD*ASIN(1.0_JPRD) IXPTLAT (:)=999999 ILSTPTLAT(:)=999999 @@ -176,7 +176,7 @@ SUBROUTINE SUSTAONL(KMEDIAP,KRESTM,LDWEIGHTED_DISTR,PWEIGHT,PMEDIAP,KPROCAGP) ! --------------------------------------- IF( NPROC > 1 )THEN DO JGL=1,ILEN - ZDIVID(JGL) = 360000.0_JPRB/REAL(G%NLOEN(D%NFRSTLAT(MY_REGION_NS)+JGL-1),JPRB) + ZDIVID(JGL) = 360000.0_JPRD/REAL(G%NLOEN(D%NFRSTLAT(MY_REGION_NS)+JGL-1),JPRD) ENDDO IF( LDWEIGHTED_DISTR )THEN ALLOCATE(ZWEIGHT(G%NLOEN(R%NDGL/2),R%NDGL)) @@ -205,7 +205,7 @@ SUBROUTINE SUSTAONL(KMEDIAP,KRESTM,LDWEIGHTED_DISTR,PWEIGHT,PMEDIAP,KPROCAGP) ILATMD = 360000 DO JGL=1,ILEN IF (IXPTLAT(JGL) <= ILSTPTLAT(JGL)) THEN - ILATMD1 = NINT(REAL(IXPTLAT(JGL)-1,JPRB)*ZDIVID(JGL)) + ILATMD1 = NINT(REAL(IXPTLAT(JGL)-1,JPRD)*ZDIVID(JGL)) IF(ILATMD1 < ILATMD) THEN ILATMD = ILATMD1 INXLAT = JGL @@ -232,7 +232,7 @@ SUBROUTINE SUSTAONL(KMEDIAP,KRESTM,LDWEIGHTED_DISTR,PWEIGHT,PMEDIAP,KPROCAGP) ILATMD = 360000 DO JGL=1,ILEN IF (IXPTLAT(JGL) <= ILSTPTLAT(JGL)) THEN - ILATMD1 = NINT(REAL(IXPTLAT(JGL)-1,JPRB)*ZDIVID(JGL)) + ILATMD1 = NINT(REAL(IXPTLAT(JGL)-1,JPRD)*ZDIVID(JGL)) IF(ILATMD1 < ILATMD) THEN ILATMD = ILATMD1 INXLAT = JGL diff --git a/src/trans/internal/sutrle_mod.F90 b/src/trans/internal/sutrle_mod.F90 index 42660a56..3ceefed1 100644 --- a/src/trans/internal/sutrle_mod.F90 +++ b/src/trans/internal/sutrle_mod.F90 @@ -52,7 +52,7 @@ SUBROUTINE SUTRLE(PNM,KGL,KLOOP) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPRD, JPIM +USE EC_PARKIND ,ONLY : JPRD, JPIM USE MPL_MODULE ,ONLY : MPL_ALLREDUCE, MPL_RECV, MPL_SEND, MPL_BARRIER, MPL_WAIT, & & JP_NON_BLOCKING_STANDARD diff --git a/src/trans/internal/suwavedi_mod.F90 b/src/trans/internal/suwavedi_mod.F90 index 5aad2cde..6995b2ee 100644 --- a/src/trans/internal/suwavedi_mod.F90 +++ b/src/trans/internal/suwavedi_mod.F90 @@ -68,7 +68,7 @@ SUBROUTINE SUWAVEDI(KSMAX,KTMAX,KPRTRW,KMYSETW,KASM0,KSPOLEGL,KPROCM,& ! K.YESSAD : 97-02-18 - Add KTMAX, bug correction for KSPOLEGL. ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM +USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE diff --git a/src/trans/internal/tpm_distr.F90 b/src/trans/internal/tpm_distr.F90 index 8912797c..d9640baa 100644 --- a/src/trans/internal/tpm_distr.F90 +++ b/src/trans/internal/tpm_distr.F90 @@ -12,7 +12,7 @@ MODULE TPM_DISTR ! Module for distributed memory environment. -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE EC_PARKIND ,ONLY : JPIM ,JPRD IMPLICIT NONE @@ -157,7 +157,7 @@ MODULE TPM_DISTR INTEGER(KIND=JPIM) :: NGPTOTMX ! Maximum number of grid columns on any of the PEs INTEGER(KIND=JPIM) ,ALLOCATABLE :: NGPTOTL(:,:) ! Number of grid columns on each PE. -REAL(KIND=JPRB) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) +REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set END TYPE DISTR_TYPE diff --git a/src/trans/internal/vd2uv_mod.F90 b/src/trans/internal/vd2uv_mod.F90 index 00e4cda8..474c169e 100644 --- a/src/trans/internal/vd2uv_mod.F90 +++ b/src/trans/internal/vd2uv_mod.F90 @@ -130,7 +130,7 @@ SUBROUTINE VD2UV(KM,KMLOC,KF_UV,KLEI2,PSPVOR,PSPDIV,PU,PV) & ZIA(:,IUL:IUU),ZIA(:,IVL:IVU)) ILCM = R%NSMAX+1-KM IOFF = D%NASM0(KM) - ZA_R = 1.0_JPRB/RA + ZA_R = 1.0_JPRB/REAL(RA,JPRB) DO J=1,ILCM INM = IOFF+(ILCM-J)*2 DO JFLD=1,KF_UV From e4cb32a77f6413af9116cea6b843b6b7bb6fca00 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Thu, 8 Feb 2024 10:37:45 +0100 Subject: [PATCH 05/48] Add the new precision independent routines to ectrans_common --- src/trans/CMakeLists.txt | 14 ++++++++++++++ src/trans/sedrenames.txt | 17 ----------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index 5462e924..34d5be94 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -48,6 +48,20 @@ list( APPEND ectrans_common_src internal/tpm_gen.F90 internal/tpm_geometry.F90 internal/tpm_pol.F90 + internal/tpm_distr.F90 + internal/pe2set_mod.F90 + internal/set2pe_mod.F90 + internal/eq_regions_mod.F90 + internal/sump_trans0_mod.F90 + internal/sustaonl_mod.F90 + internal/sumplat_mod.F90 + internal/sumplatb_mod.F90 + internal/sumplatbeq_mod.F90 + internal/sumplatf_mod.F90 + internal/mysendset_mod.F90 + internal/myrecvset_mod.F90 + internal/suwavedi_mod.F90 + internal/sump_trans_preleg_mod.F90 external/get_current.F90 ${CMAKE_CURRENT_BINARY_DIR}/internal/ectrans_version_mod.F90 ) diff --git a/src/trans/sedrenames.txt b/src/trans/sedrenames.txt index 73004147..5354cdcb 100644 --- a/src/trans/sedrenames.txt +++ b/src/trans/sedrenames.txt @@ -7,7 +7,6 @@ s/ASRE1BAD_MOD/ASRE1BAD_MOD_VARIANTDESIGNATOR/g s/BLUESTEIN_MOD/BLUESTEIN_MOD_VARIANTDESIGNATOR/g s/BUTTERFLY_ALG_MOD/BUTTERFLY_ALG_MOD_VARIANTDESIGNATOR/g s/CDMAP_MOD/CDMAP_MOD_VARIANTDESIGNATOR/g -s/CPLEDN_MOD/CPLEDN_MOD_VARIANTDESIGNATOR/g s/DEALLOC_RESOL_MOD/DEALLOC_RESOL_MOD_VARIANTDESIGNATOR/g s/DIR_TRANS_CTL_MOD/DIR_TRANS_CTL_MOD_VARIANTDESIGNATOR/g s/DIR_TRANS_CTLAD_MOD/DIR_TRANS_CTLAD_MOD_VARIANTDESIGNATOR/g @@ -25,8 +24,6 @@ s/DIST_SPEC_CONTROL_MOD/DIST_SPEC_CONTROL_MOD_VARIANTDESIGNATOR/g s/dist_spec( *($|\(| |\*))/dist_spec_VARIANTDESIGNATOR\1/g s/DIST_SPEC( *($|\(| |\*))/DIST_SPEC_VARIANTDESIGNATOR\1/g s/ectrans_mod/ectrans_mod_VARIANTDESIGNATOR/g -s/eq_regions_mod/eq_regions_mod_VARIANTDESIGNATOR/g -s/EQ_REGIONS_MOD/EQ_REGIONS_MOD_VARIANTDESIGNATOR/g s/FFTB_PLAN/FFTB_PLAN_VARIANTDESIGNATOR/g s/FFTB_TYPE/FFTB_TYPE_VARIANTDESIGNATOR/g s/FIELD_SPLIT_MOD/FIELD_SPLIT_MOD_VARIANTDESIGNATOR/g @@ -84,12 +81,9 @@ s/LTINV_CTL_MOD/LTINV_CTL_MOD_VARIANTDESIGNATOR/g s/LTINV_CTLAD_MOD/LTINV_CTLAD_MOD_VARIANTDESIGNATOR/g s/LTINV_MOD/LTINV_MOD_VARIANTDESIGNATOR/g s/LTINVAD_MOD/LTINVAD_MOD_VARIANTDESIGNATOR/g -s/MYRECVSET_MOD/MYRECVSET_MOD_VARIANTDESIGNATOR/g -s/MYSENDSET_MOD/MYSENDSET_MOD_VARIANTDESIGNATOR/g s/parkind1/ec_parkind/g s/PARKIND1/EC_PARKIND/g s/PARKIND2/EC_PARKIND/g -s/PE2SET_MOD/PE2SET_MOD_VARIANTDESIGNATOR/g s/PRE_SULEG_MOD/PRE_SULEG_MOD_VARIANTDESIGNATOR/g s/PREPSNM_MOD/PREPSNM_MOD_VARIANTDESIGNATOR/g s/PRFI1_MOD/PRFI1_MOD_VARIANTDESIGNATOR/g @@ -104,7 +98,6 @@ s/READ_LEGPOL_MOD/READ_LEGPOL_MOD_VARIANTDESIGNATOR/g s/seefmm_mix/seefmm_mix_VARIANTDESIGNATOR/g s/SEEFMM_MIX/SEEFMM_MIX_VARIANTDESIGNATOR/g s/SET_RESOL_MOD/SET_RESOL_MOD_VARIANTDESIGNATOR/g -s/SET2PE_MOD/SET2PE_MOD_VARIANTDESIGNATOR/g s/SET99( *($|\(| |\*))/SET99_VARIANTDESIGNATOR\1/g s/SET99B/SET99B_VARIANTDESIGNATOR/g s/SETUP_DIMS_MOD/SETUP_DIMS_MOD_VARIANTDESIGNATOR/g @@ -123,17 +116,7 @@ s/SPNSDE_MOD/SPNSDE_MOD_VARIANTDESIGNATOR/g s/SPNSDEAD_MOD/SPNSDEAD_MOD_VARIANTDESIGNATOR/g s/SUFFT_MOD/SUFFT_MOD_VARIANTDESIGNATOR/g s/SULEG_MOD/SULEG_MOD_VARIANTDESIGNATOR/g -s/SUMP_TRANS_MOD/SUMP_TRANS_MOD_VARIANTDESIGNATOR/g -s/SUMP_TRANS_PRELEG_MOD/SUMP_TRANS_PRELEG_MOD_VARIANTDESIGNATOR/g -s/SUMP_TRANS0_MOD/SUMP_TRANS0_MOD_VARIANTDESIGNATOR/g -s/SUMPLAT_MOD/SUMPLAT_MOD_VARIANTDESIGNATOR/g -s/SUMPLATB_MOD/SUMPLATB_MOD_VARIANTDESIGNATOR/g -s/SUMPLATBEQ_MOD/SUMPLATBEQ_MOD_VARIANTDESIGNATOR/g -s/SUMPLATF_MOD/SUMPLATF_MOD_VARIANTDESIGNATOR/g -s/SUSTAONL_MOD/SUSTAONL_MOD_VARIANTDESIGNATOR/g s/SUTRLE_MOD/SUTRLE_MOD_VARIANTDESIGNATOR/g -s/SUWAVEDI_MOD/SUWAVEDI_MOD_VARIANTDESIGNATOR/g -s/TPM_DISTR/TPM_DISTR_VARIANTDESIGNATOR/g s/TPM_FFT( *(,|$| ))/TPM_FFT_VARIANTDESIGNATOR\1/g s/TPM_FFTW/TPM_FFTW_VARIANTDESIGNATOR/g s/TPM_FIELDS/TPM_FIELDS_VARIANTDESIGNATOR/g From 56eb451f0ea82268e3049c205f7eadbed4eaaef1 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Thu, 8 Feb 2024 11:35:49 +0100 Subject: [PATCH 06/48] setup_trans, setup_trans0 API change, using JPRD kind for arguments --- src/programs/ectrans-benchmark.F90 | 2 +- src/trans/external/setup_trans.F90 | 14 ++++++++------ src/trans/external/setup_trans0.F90 | 6 +++--- src/trans/include/ectrans/setup_trans.h | 9 +++++---- src/trans/include/ectrans/setup_trans0.h | 4 ++-- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index d7844ec3..ff7dae22 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -144,7 +144,7 @@ program transform_test ! Verbosity level (0 or 1) integer :: verbosity = 0 -real(kind=jprb) :: zra = 6371229._jprb +real(kind=jprd) :: zra = 6371229._jprd integer(kind=jpim) :: nmax_resol = 37 ! Max number of resolutions integer(kind=jpim) :: npromatr = 0 ! nproma for trans lib diff --git a/src/trans/external/setup_trans.F90 b/src/trans/external/setup_trans.F90 index 13132e6f..171c2c98 100644 --- a/src/trans/external/setup_trans.F90 +++ b/src/trans/external/setup_trans.F90 @@ -35,7 +35,9 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! LDSPLIT - true if split latitudes in grid-point space [false] ! KTMAX - truncation order for tendencies? ! KRESOL - the resolution identifier -! PWEIGHT - the weight per grid-point (for a weighted distribution) +! PWEIGHT - the weight per grid-point (for a weighted distribution); +! Note, only seems to be used from within enkf + ! LDGRIDONLY - true if only grid space is required ! KSMAX,KDGL,KTMAX and KLOEN are GLOBAL variables desribing the resolution @@ -96,7 +98,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB, JPRD +USE EC_PARKIND ,ONLY : JPIM ,JPRD USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT,C_ASSOCIATED,C_SIZE_T !ifndef INTERFACE @@ -137,8 +139,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& LOGICAL ,OPTIONAL,INTENT(IN) :: LDSPLIT INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KTMAX INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT):: KRESOL -REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PWEIGHT(:) -REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSTRET +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PWEIGHT(:) +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PSTRET LOGICAL ,OPTIONAL,INTENT(IN):: LDGRIDONLY LOGICAL ,OPTIONAL,INTENT(IN):: LDUSEFLT LOGICAL ,OPTIONAL,INTENT(IN):: LDUSERPNM @@ -311,7 +313,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF(SIZE(PWEIGHT) /= SUM(G%NLOEN(:)) )THEN CALL ABORT_TRANS('SETUP_TRANS:SIZE(PWEIGHT) /= SUM(G%NLOEN(:))') ENDIF - IF( MINVAL(PWEIGHT(:)) < 0.0_JPRB )THEN + IF( MINVAL(PWEIGHT(:)) < 0.0_JPRD )THEN CALL ABORT_TRANS('SETUP_TRANS: INVALID WEIGHTS') ENDIF ALLOCATE(D%RWEIGHT(SIZE(PWEIGHT))) @@ -344,7 +346,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& S%LSOUTHPNM=.FALSE. IF(PRESENT(PSTRET)) THEN - IF (ABS(PSTRET-1.0_JPRB)>100._JPRB*EPSILON(1._JPRB)) THEN + IF (ABS(PSTRET-1.0_JPRD)>100._JPRD*EPSILON(1._JPRD)) THEN G%RSTRET=PSTRET S%LSOUTHPNM=.TRUE. ENDIF diff --git a/src/trans/external/setup_trans0.F90 b/src/trans/external/setup_trans0.F90 index cbc22baf..8f63af35 100644 --- a/src/trans/external/setup_trans0.F90 +++ b/src/trans/external/setup_trans0.F90 @@ -68,7 +68,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB ,JPRD +USE EC_PARKIND ,ONLY : JPIM ,JPRD !ifndef INTERFACE @@ -93,7 +93,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KTRANS_SYNC_LEVEL LOGICAL ,OPTIONAL,INTENT(IN) :: LDEQ_REGIONS LOGICAL ,OPTIONAL,INTENT(IN) :: LDALLOPERM -REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PRAD +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PRAD INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KOPT_MEMORY_TR INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT) :: K_REGIONS(:) INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT) :: K_REGIONS_NS @@ -209,7 +209,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& ENDIF IF(PRESENT(PRAD)) THEN - RA=REAL(PRAD,JPRD) + RA=PRAD ENDIF IF(PRESENT(LDALLOPERM)) THEN diff --git a/src/trans/include/ectrans/setup_trans.h b/src/trans/include/ectrans/setup_trans.h index 810e765b..0889e0d4 100644 --- a/src/trans/include/ectrans/setup_trans.h +++ b/src/trans/include/ectrans/setup_trans.h @@ -36,7 +36,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! LDSPLIT - true if split latitudes in grid-point space [false] ! KTMAX - truncation order for tendencies? ! KRESOL - the resolution identifier -! PWEIGHT - the weight per grid-point (for a weighted distribution) +! PWEIGHT - the weight per grid-point (for a weighted distribution); +! Note, only seems to be used from within enkf ! LDGRIDONLY - true if only grid space is required ! KSMAX,KDGL,KTMAX and KLOEN are GLOBAL variables desribing the resolution @@ -78,7 +79,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM ,JPRD USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT,C_ASSOCIATED,C_SIZE_T @@ -92,8 +93,8 @@ INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KLOEN(:) LOGICAL ,OPTIONAL,INTENT(IN) :: LDSPLIT INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KTMAX INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT):: KRESOL -REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PWEIGHT(:) -REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSTRET +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PWEIGHT(:) +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PSTRET LOGICAL ,OPTIONAL,INTENT(IN):: LDGRIDONLY LOGICAL ,OPTIONAL,INTENT(IN):: LDUSEFLT LOGICAL ,OPTIONAL,INTENT(IN):: LDUSERPNM diff --git a/src/trans/include/ectrans/setup_trans0.h b/src/trans/include/ectrans/setup_trans0.h index b716f60b..d47d103f 100644 --- a/src/trans/include/ectrans/setup_trans0.h +++ b/src/trans/include/ectrans/setup_trans0.h @@ -69,7 +69,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE EC_PARKIND ,ONLY : JPIM ,JPRD IMPLICIT NONE @@ -80,7 +80,7 @@ LOGICAL ,OPTIONAL,INTENT(IN) :: LDSYNC_TRANS INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KTRANS_SYNC_LEVEL LOGICAL ,OPTIONAL,INTENT(IN) :: LDEQ_REGIONS LOGICAL ,OPTIONAL,INTENT(IN) :: LDALLOPERM -REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PRAD +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PRAD INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KOPT_MEMORY_TR INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT) :: K_REGIONS(:) INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT) :: K_REGIONS_NS From d812b8c22a80d4942c575f0474f09eba58eac9aa Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Thu, 8 Feb 2024 11:36:21 +0100 Subject: [PATCH 07/48] Add setup_trans0 to ectrans_common --- src/trans/CMakeLists.txt | 2 ++ src/trans/sedrenames.txt | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index 34d5be94..c5a1759f 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -63,10 +63,12 @@ list( APPEND ectrans_common_src internal/suwavedi_mod.F90 internal/sump_trans_preleg_mod.F90 external/get_current.F90 + external/setup_trans0.F90 ${CMAKE_CURRENT_BINARY_DIR}/internal/ectrans_version_mod.F90 ) list( APPEND ectrans_common_includes include/ectrans/get_current.h + include/ectrans/setup_trans0.h ) ecbuild_add_library( diff --git a/src/trans/sedrenames.txt b/src/trans/sedrenames.txt index 5354cdcb..0f8e934b 100644 --- a/src/trans/sedrenames.txt +++ b/src/trans/sedrenames.txt @@ -104,8 +104,6 @@ s/SETUP_DIMS_MOD/SETUP_DIMS_MOD_VARIANTDESIGNATOR/g s/SETUP_GEOM_MOD/SETUP_GEOM_MOD_VARIANTDESIGNATOR/g s/SETUP_TRANS( *($|\(| |\*))/SETUP_TRANS_VARIANTDESIGNATOR\1/g s/setup_trans( *($|\(| |\*|\.h))/setup_trans_VARIANTDESIGNATOR\1/g -s/setup_trans0( *($|\(| |\*|\.h))/setup_trans0_VARIANTDESIGNATOR\1/g -s/SETUP_TRANS0/SETUP_TRANS0_VARIANTDESIGNATOR/g s/SHUFFLE_MOD/SHUFFLE_MOD_VARIANTDESIGNATOR/g s/specnorm/specnorm_VARIANTDESIGNATOR/g s/SPECNORM/SPECNORM_VARIANTDESIGNATOR/g From 7ff6a734d9994c1e6328622bd5abcf9bd8d97098 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Wed, 28 Feb 2024 13:08:49 +0100 Subject: [PATCH 08/48] Version bump due to API change: 1.4.0-prerelease --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index f0bb29e7..93b33ad5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.3.0 +1.4.0-prerelease From 91a51b1dbca7e0cb5b9ba789f6c2c24be4774d88 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Thu, 29 Feb 2024 17:14:00 +0100 Subject: [PATCH 09/48] Cleanup #61 --- src/trans/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index c5a1759f..4722fea4 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -129,7 +129,7 @@ function(generate_backend_includes) set(backend ${_PAR_BACKEND}) file(MAKE_DIRECTORY ${destination}) - file(MAKE_DIRECTORY ${destination}/${backend}) + file(MAKE_DIRECTORY ${destination}/trans_${backend}) ecbuild_list_add_pattern( LIST files GLOB include/ectrans/*.h QUIET ) From 522a7308771799fad1d993ef7526331a198ad897 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Wed, 28 Feb 2024 17:26:45 +0100 Subject: [PATCH 10/48] Remove capability of using a different LAPACK for sp and dp --- cmake/ectrans_find_lapack.cmake | 38 --------------------------------- src/trans/CMakeLists.txt | 6 +++--- 2 files changed, 3 insertions(+), 41 deletions(-) diff --git a/cmake/ectrans_find_lapack.cmake b/cmake/ectrans_find_lapack.cmake index 293207a6..ffbc5165 100644 --- a/cmake/ectrans_find_lapack.cmake +++ b/cmake/ectrans_find_lapack.cmake @@ -30,42 +30,4 @@ macro( ectrans_find_lapack ) endif() ecbuild_debug_var( LAPACK_LIBRARIES ) - set( LAPACK_sp ${LAPACK_LIBRARIES} CACHE STRING "ectrans: Double precision LAPACK libraries" ) - set( LAPACK_dp ${LAPACK_LIBRARIES} CACHE STRING "ectrans: Single precision LAPACK libraries" ) - - set( LAPACK_LINK PRIVATE ) - - ### Following is a hack that should be removed when there is no more Cray computer at ECMWF - # It allows to use a different LAPACK library for single and double precision, to be able to - # stay bitreproducible for double precision in operations of CY47R1 - - set( _cray_libsci_loaded $ENV{CRAY_LIBSCI_DIR} ) - - if( CMAKE_Fortran_COMPILER_ID MATCHES "Cray" ) - if( HAVE_MKL AND ECTRANS_CRAYHACK_DOUBLE_PRECISION_WITHOUT_MKL ) - # Following libsci code should disappear soon, with more recent cmake versions (needs more investigation) - if( _cray_libsci_loaded ) - set( _CRAY_PRGENV $ENV{PE_ENV} ) - string( TOLOWER "${_CRAY_PRGENV}" _cray_prgenv ) - set( LAPACK_dp sci_${_cray_prgenv} ) - ecbuild_debug( "LAPACK found, already loaded as part of Cray's libsci" ) - else() - ecbuild_find_package( NAME LAPACK REQUIRED ) - set( LAPACK_dp ${LAPACK_LIBRARIES} ) - if( TARGET lapack ) - ecbuild_debug( "LAPACK found as CMake target lapack" ) - set( LAPACK_dp lapack ) - endif() - endif() - endif() - endif() - - if( _cray_libsci_loaded ) - if( NOT LAPACK_sp MATCHES "sci" OR NOT LAPACK_dp MATCHES "sci" ) - ecbuild_warn( "Danger! Cray's libsci is loaded, which is different from selected LAPACK. " - "No guarantees on link order can be made for the final executable.") - set( LAPACK_LINK PUBLIC ) - endif() - endif() - endmacro() diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index 4722fea4..9eeae222 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -235,7 +235,7 @@ foreach( prec dp sp ) if( HAVE_FFTW ) set( FFTW_LINK PRIVATE ) - if( LAPACK_${prec} MATCHES "mkl" AND NOT FFTW_LIBRARIES MATCHES "mkl" ) + if( LAPACK_LIBRARIES MATCHES "mkl" AND NOT FFTW_LIBRARIES MATCHES "mkl" ) ecbuild_warn( "Danger: Both MKL and FFTW are linked in trans_${prec}. " "No guarantees on link order can be made for the final executable.") set( FFTW_LINK PUBLIC ) # Attempt anyway to give FFTW precedence @@ -245,8 +245,8 @@ foreach( prec dp sp ) target_include_directories( ectrans_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) target_compile_definitions( ectrans_${prec} PRIVATE WITH_FFTW ) endif() - ecbuild_debug("target_link_libraries( ectrans_${prec} ${LAPACK_LINK} ${LAPACK_${prec}} )") - target_link_libraries( ectrans_${prec} ${LAPACK_LINK} ${LAPACK_${prec}} ) + ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE ${LAPACK_LIBRARIES} )") + target_link_libraries( ectrans_${prec} PRIVATE ${LAPACK_LIBRARIES} ) if( HAVE_OMP ) ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran ) From 2bcb8373b16dd78a370283871689764afd2b1cd2 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Wed, 28 Feb 2024 17:27:11 +0100 Subject: [PATCH 11/48] Create BLAS interfaces for GEMM and GEMV --- src/trans/CMakeLists.txt | 2 + src/trans/algor/butterfly_alg_mod.F90 | 167 ++++---------- src/trans/algor/ectrans_blas_mod.F90 | 310 ++++++++++++++++++++++++++ src/trans/internal/ledir_mod.F90 | 32 +-- src/trans/internal/ledirad_mod.F90 | 25 +-- src/trans/internal/leinv_mod.F90 | 42 +--- src/trans/internal/leinvad_mod.F90 | 24 +- 7 files changed, 375 insertions(+), 227 deletions(-) create mode 100644 src/trans/algor/ectrans_blas_mod.F90 diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index 9eeae222..f12c632a 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -34,6 +34,7 @@ endif() ## Sources which are precision independent can go into a common library list( APPEND ectrans_common_src + algor/ectrans_blas_mod.F90 sharedmem/sharedmem_mod.F90 sharedmem/sharedmem.c internal/abort_trans_mod.F90 @@ -76,6 +77,7 @@ ecbuild_add_library( LINKER_LANGUAGE Fortran SOURCES ${ectrans_common_src} PUBLIC_LIBS fiat + PRIVATE_LIBS ${LAPACK_LIBRARIES} ) ectrans_target_fortran_module_directory( TARGET ectrans_common diff --git a/src/trans/algor/butterfly_alg_mod.F90 b/src/trans/algor/butterfly_alg_mod.F90 index 71bd7f4d..364bd411 100644 --- a/src/trans/algor/butterfly_alg_mod.F90 +++ b/src/trans/algor/butterfly_alg_mod.F90 @@ -12,8 +12,7 @@ MODULE BUTTERFLY_ALG_MOD USE PARKIND1, ONLY : JPRD, JPRM, JPIM, JPRB, JPIB USE INTERPOL_DECOMP_MOD, ONLY : COMPUTE_ID USE SHAREDMEM_MOD, ONLY : SHAREDMEM, SHAREDMEM_ASSOCIATE - -use, intrinsic :: ieee_exceptions +USE ECTRANS_BLAS_MOD, ONLY : GEMM, GEMV IMPLICIT NONE @@ -64,12 +63,6 @@ MODULE BUTTERFLY_ALG_MOD REAL(KIND=JPRB) , ALLOCATABLE :: COMMSBUF(:) ! for communicating packed bufferfly_structs END TYPE CLONE ! between MPI tasks -#ifdef WITH_IEEE_HALT -LOGICAL, PARAMETER :: LL_IEEE_HALT = .TRUE. -#else -LOGICAL, PARAMETER :: LL_IEEE_HALT = .FALSE. -#endif - LOGICAL, PARAMETER :: LLDOUBLE = (JPRB == JPRD) CONTAINS @@ -610,15 +603,9 @@ SUBROUTINE MULT_BUTV(CDTRANS,YD_STRUCT,PVECIN,PVECOUT) IFR = YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IFROW ILR = YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%ILROW IROWS=YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IROWS - IF (LLDOUBLE) THEN - CALL DGEMV('T',IROWS,YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IRANK,& - & 1.0_JPRD,YNODE%B,IROWS,PVECIN(IFR:ILR),1,& - & 0.0_JPRD,ZBETA(IBTST:IBTEN,IBETALV),1) - ELSE - CALL SGEMV('T',IROWS,YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IRANK,& - & 1.0_JPRM,YNODE%B,IROWS,PVECIN(IFR:ILR),1,& - & 0.0_JPRM,ZBETA(IBTST:IBTEN,IBETALV),1) - ENDIF + CALL GEMV('T',IROWS,YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IRANK,& + & 1.0_JPRB,YNODE%B,IROWS,PVECIN(IFR:ILR),1,& + & 0.0_JPRB,ZBETA(IBTST:IBTEN,IBETALV),1) ENDIF ILM1 = JL-1 IBETALVM1=MOD(ILM1,2) @@ -689,15 +676,9 @@ SUBROUTINE MULT_BUTV(CDTRANS,YD_STRUCT,PVECIN,PVECOUT) IFR = YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IFROW ILR = YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%ILROW IROWS = YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IROWS - IF (LLDOUBLE) THEN - CALL DGEMV('N',IROWS,YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IRANK,& - & 1.0_JPRD,YNODE%B,IROWS,ZBETA(IBTST:IBTEN,IBETALV),1,& - & 0.0_JPRD,PVECOUT(IFR:ILR),1) - ELSE - CALL SGEMV('N',IROWS,YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IRANK,& - & 1.0_JPRM,YNODE%B,IROWS,ZBETA(IBTST:IBTEN,IBETALV),1,& - & 0.0_JPRM,PVECOUT(IFR:ILR),1) - ENDIF + CALL GEMV('N',IROWS,YD_STRUCT%SLEV(ILEVS)%NODE(JJ,JK)%IRANK,& + & 1.0_JPRB,YNODE%B,IROWS,ZBETA(IBTST:IBTEN,IBETALV),1,& + & 0.0_JPRB,PVECOUT(IFR:ILR),1) ENDIF ENDDO ENDDO @@ -724,7 +705,6 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) REAL(KIND=JPRB) :: ZVECIN(YD_STRUCT%N_ORDER,KF),ZVECOUT(YD_STRUCT%N_ORDER,KF) REAL(KIND=JPRB),ALLOCATABLE :: ZBETA(:,:,:) LOGICAL :: LLTRANSPOSE -LOGICAL :: LL_HALT_INVALID ! IKWV==0 only, LLTRANSPOSE = true only REAL(KIND=JPRD),ALLOCATABLE :: ZPNONIM_D(:,:) @@ -770,7 +750,7 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) IF( IM <=0 ) CALL ABOR1('mult_butm: IM<=0 not allowed') IF(IN>0) THEN IF (LLDOUBLE.OR.(IKWV == 0)) THEN - IF(.not.LLDOUBLE) THEN + IF(.NOT.LLDOUBLE) THEN ALLOCATE(ZPNONIM_D(IM,IN)) II=0 DO JN=1,IN @@ -780,25 +760,20 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) ENDDO ENDDO ZBETA_D(1:IM,1:KF)=REAL(ZBETA(IBTST:IBTST+IM-1,1:KF,IBETALV),JPRD) - CALL DGEMM('T','N',IN,KF,IM,1.0_JPRD,& - & ZPNONIM_D,IM,ZBETA_D,ILBETA,0.0_JPRD,& - & ZOUT_D,YD_STRUCT%N_ORDER) + CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,& + & ZPNONIM_D(1,1),IM,ZBETA_D(1,1),ILBETA,0.0_JPRD,& + & ZOUT_D(1,1),YD_STRUCT%N_ORDER) ZVECOUT(YNODE%IRANK+1:YNODE%IRANK+IN,1:KF) = REAL(ZOUT_D(1:IN,1:KF),JPRM) DEALLOCATE(ZPNONIM_D) ELSE - CALL DGEMM('T','N',IN,KF,IM,1.0_JPRD,& + CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,& & YNODE%PNONIM(1),IM,ZBETA(IBTST,1,IBETALV),ILBETA,0.0_JPRD,& & ZVECOUT(YNODE%IRANK+1,1),YD_STRUCT%N_ORDER) ENDIF ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('T','N',IN,KF,IM,1.0_JPRM,& + CALL GEMM('T','N',IN,KF,IM,1.0_JPRM,& & YNODE%PNONIM(1),IM,ZBETA(IBTST,1,IBETALV),ILBETA,0.0_JPRM,& & ZVECOUT(YNODE%IRANK+1,1),YD_STRUCT%N_ORDER) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) ENDIF ENDIF DO JF=1,KF @@ -818,12 +793,12 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) IROWS =YNODE%IROWS IRANK = YNODE%IRANK IF (LLDOUBLE.OR.(IKWV == 0)) THEN - IF(.not.LLDOUBLE) THEN + IF(.NOT.LLDOUBLE) THEN ALLOCATE(ZB_D(IROWS,IRANK)) ZB_D(1:IROWS,1:IRANK) = REAL(YNODE%B(1:IROWS,1:IRANK),JPRD) ZIN_D(1:ILR-IFR+1,1:KF) = REAL(PVECIN(IFR:ILR,1:KF),JPRD) - CALL DGEMM('T','N',IRANK,KF,IROWS,1.0_JPRD,& + CALL GEMM('T','N',IRANK,KF,IROWS,1.0_JPRD,& & ZB_D,IROWS,ZIN_D,IRIN,0.0_JPRD,& & ZBETA_D,ILBETA) @@ -831,19 +806,14 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) DEALLOCATE(ZB_D) ELSE - CALL DGEMM('T','N',IRANK,KF,IROWS,1.0_JPRD,& - & YNODE%B,IROWS,PVECIN(IFR,1),IRIN,0.0_JPRD,& + CALL GEMM('T','N',IRANK,KF,IROWS,1.0_JPRD,& + & YNODE%B(1,1),IROWS,PVECIN(IFR,1),IRIN,0.0_JPRD,& & ZBETA(IBTST,1,IBETALV),ILBETA) END IF ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('T','N',IRANK,KF,IROWS,1.0_JPRM,& - & YNODE%B,IROWS,PVECIN(IFR,1),IRIN,0.0_JPRM,& + CALL GEMM('T','N',IRANK,KF,IROWS,1.0_JPRM,& + & YNODE%B(1,1),IROWS,PVECIN(IFR,1),IRIN,0.0_JPRM,& & ZBETA(IBTST,1,IBETALV),ILBETA) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) ENDIF ENDIF ILM1 = JL-1 @@ -867,7 +837,7 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) IF( IM <=0 ) CALL ABOR1('mult_butm: IM<=0 not allowed') IF(IN>0) THEN IF (LLDOUBLE.OR.(IKWV == 0)) THEN - IF(.not.LLDOUBLE) THEN + IF(.NOT.LLDOUBLE) THEN ALLOCATE(ZPNONIM_D(IM,IN)) II=0 DO JN=1,IN @@ -878,26 +848,21 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) ENDDO ZBETA_D(1:IM,1:KF)=REAL(ZBETA(IBTST:IBTST+IM-1,1:KF,IBETALV),JPRD) - CALL DGEMM('T','N',IN,KF,IM,1.0_JPRD,& + CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,& & ZPNONIM_D,IM,ZBETA_D,ILBETA,0.0_JPRD,& & ZOUT_D,YD_STRUCT%N_ORDER) ZVECOUT(YNODE%IRANK+1:YNODE%IRANK+IN,1:KF) = REAL(ZOUT_D(1:IN,1:KF),JPRM) DEALLOCATE(ZPNONIM_D) ELSE - CALL DGEMM('T','N',IN,KF,IM,1.0_JPRD,& + CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,& & YNODE%PNONIM(1),IM,ZBETA(IBTST,1,IBETALV),ILBETA,0.0_JPRD,& & ZVECOUT(YNODE%IRANK+1,1),YD_STRUCT%N_ORDER) ENDIF ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('T','N',IN,KF,IM,1.0_JPRM,& + CALL GEMM('T','N',IN,KF,IM,1.0_JPRM,& & YNODE%PNONIM(1),IM,ZBETA(IBTST,1,IBETALV),ILBETA,0.0_JPRM,& & ZVECOUT(YNODE%IRANK+1,1),YD_STRUCT%N_ORDER) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) ENDIF ENDIF DO JF=1,KF @@ -963,20 +928,9 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) ENDDO IF( IRANK <=0 ) CALL ABOR1('mult_butm: IRANK<=0 not allowed') IF(YNODE%ICOLS > IRANK) THEN - IF (LLDOUBLE) THEN - CALL DGEMM('N','N',IRANK,KF,IN,1.0_JPRD,& - & YNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YD_STRUCT%N_ORDER,1.0_JPRD,& + CALL GEMM('N','N',IRANK,KF,IN,1.0_JPRB,& + & YNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YD_STRUCT%N_ORDER,1.0_JPRB,& & ZBETA(IBTST,1,IBETALV),ILBETA) - ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('N','N',IRANK,KF,IN,1.0_JPRM,& - & YNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YD_STRUCT%N_ORDER,1.0_JPRM,& - & ZBETA(IBTST,1,IBETALV),ILBETA) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) - ENDIF ENDIF ELSE ILM1 = JL-1 @@ -1011,20 +965,9 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) ENDDO IF( IRANK <=0 ) CALL ABOR1('mult_butm: IRANK<=0 not allowed') IF(YNODE%ICOLS > IRANK) THEN - IF (LLDOUBLE) THEN - CALL DGEMM('N','N',IRANK,KF,IN,1.0_JPRD,& - & YNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YD_STRUCT%N_ORDER,1.0_JPRD,& + CALL GEMM('N','N',IRANK,KF,IN,1.0_JPRB,& + & YNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YD_STRUCT%N_ORDER,1.0_JPRB,& & ZBETA(IBTST,1,IBETALV),ILBETA) - ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('N','N',IRANK,KF,IN,1.0_JPRM,& - & YNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YD_STRUCT%N_ORDER,1.0_JPRM,& - & ZBETA(IBTST,1,IBETALV),ILBETA) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) - ENDIF ENDIF ENDIF IF( IRANK <=0 ) CALL ABOR1('mult_butm: IRANK<=0 not allowed') @@ -1032,20 +975,9 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) IFR = YNODE%IFROW ILR = YNODE%ILROW IROWS = YNODE%IROWS - IF (LLDOUBLE) THEN - CALL DGEMM('N','N',IROWS,KF,YNODE%IRANK,1.0_JPRD,& - & YNODE%B,IROWS,ZBETA(IBTST,1,IBETALV),YD_STRUCT%IBETALEN_MAX,0.0_JPRD,& + CALL GEMM('N','N',IROWS,KF,YNODE%IRANK,1.0_JPRB,& + & YNODE%B(1,1),IROWS,ZBETA(IBTST,1,IBETALV),YD_STRUCT%IBETALEN_MAX,0.0_JPRB,& & PVECOUT(IFR,1),IROUT) - ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('N','N',IROWS,KF,YNODE%IRANK,1.0_JPRM,& - & YNODE%B,IROWS,ZBETA(IBTST,1,IBETALV),YD_STRUCT%IBETALEN_MAX,0.0_JPRM,& - & PVECOUT(IFR,1),IROUT) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) - ENDIF ENDIF ENDDO ENDDO @@ -1079,10 +1011,10 @@ SUBROUTINE MULT_P(YDNODE,PVECIN,PVECOUT) IM = IRANK IN = YDNODE%ICOLS-IRANK IF (JPRB == JPRD) THEN - CALL DGEMV('N',IM,IN,1.0_JPRD,YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1),1,1.0_JPRD,ZVECOUT,1) + CALL GEMV('N',IM,IN,1.0_JPRD,YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1),1,1.0_JPRD,ZVECOUT(1),1) PVECOUT(:)=ZVECOUT(:) ELSE - CALL SGEMV('N',IM,IN,1.0_JPRM,YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1),1,1.0_JPRM,PVECOUT,1) + CALL GEMV('N',IM,IN,1.0_JPRM,YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1),1,1.0_JPRM,PVECOUT(1),1) ENDIF ENDIF @@ -1100,7 +1032,6 @@ SUBROUTINE MULT_PM(YDNODE,KF,KLBETA,PVECIN,PVECOUT) REAL(KIND=JPRB) :: ZVECIN(YDNODE%ICOLS,KF), ZVECOUT(SIZE(PVECOUT(:,1)),KF) INTEGER(KIND=JPIM) :: JN,IDX,IRANK,IM,IN,JF -LOGICAL :: LL_HALT_INVALID !--------------------------------------------------------- IRANK = YDNODE%IRANK @@ -1117,20 +1048,9 @@ SUBROUTINE MULT_PM(YDNODE,KF,KLBETA,PVECIN,PVECOUT) ENDDO ENDDO IF(YDNODE%ICOLS > IRANK) THEN - IF (JPRB == JPRD) THEN - CALL DGEMM('N','N',IRANK,KF,IN,1.0_JPRD,& - & YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YDNODE%ICOLS,1.0_JPRD,& - & PVECOUT,IRANK) - ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('N','N',IRANK,KF,IN,1.0_JPRM,& - & YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YDNODE%ICOLS,1.0_JPRM,& - & PVECOUT,IRANK) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) - ENDIF + CALL GEMM('N','N',IRANK,KF,IN,1.0_JPRB,& + & YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YDNODE%ICOLS,1.0_JPRB,& + & PVECOUT(1,1),IRANK) ENDIF END SUBROUTINE MULT_PM !================================================================== @@ -1151,9 +1071,9 @@ SUBROUTINE MULT_P_TR(YDNODE,PVECIN,PVECOUT) IM = IRANK IF (JPRB == JPRD) THEN ZVECIN(:) = PVECIN(:) - CALL DGEMV('T',IM,IN,1.0_JPRD,YDNODE%PNONIM,IRANK,ZVECIN,1,0.0_JPRD,ZVECOUT(IRANK+1),1) + CALL GEMV('T',IM,IN,1.0_JPRD,YDNODE%PNONIM(1),IRANK,ZVECIN(1),1,0.0_JPRD,ZVECOUT(IRANK+1),1) ELSE - CALL SGEMV('T',IM,IN,1.0_JPRM,YDNODE%PNONIM,IRANK,PVECIN,1,0.0_JPRM,ZVECOUT(IRANK+1),1) + CALL GEMV('T',IM,IN,1.0_JPRM,YDNODE%PNONIM(1),IRANK,PVECIN(1),1,0.0_JPRM,ZVECOUT(IRANK+1),1) ENDIF ENDIF DO JK=1,IRANK @@ -1178,8 +1098,6 @@ SUBROUTINE MULT_P_TRM(YDNODE,KF,PVECIN,PVECOUT) REAL(KIND=JPRB) :: ZVECOUT(YDNODE%ICOLS,KF), ZVECIN(SIZE(PVECIN(:,1)),KF) INTEGER(KIND=JPIM) :: JK,JN,IDX,IM,IN,JF -LOGICAL :: LL_HALT_INVALID - !------------------------------------------------------------------ IN = YDNODE%ICOLS-YDNODE%IRANK @@ -1187,18 +1105,9 @@ SUBROUTINE MULT_P_TRM(YDNODE,KF,PVECIN,PVECOUT) IF(IN>0) THEN IF (JPRB == JPRD) THEN ZVECIN(:,:) = PVECIN(:,:) - CALL DGEMM('T','N',IN,KF,IM,1.0_JPRD,& - & YDNODE%PNONIM(1),IM,ZVECIN,IM,0.0_JPRD,& - & ZVECOUT(YDNODE%IRANK+1,1),YDNODE%ICOLS) + CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,YDNODE%PNONIM(1),IM,ZVECIN(1,1),IM,0.0_JPRD,ZVECOUT(YDNODE%IRANK+1,1),YDNODE%ICOLS) ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('T','N',IN,KF,IM,1.0_JPRM,& - & YDNODE%PNONIM(1),IM,PVECIN,IM,0.0_JPRM,& - & ZVECOUT(YDNODE%IRANK+1,1),YDNODE%ICOLS) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) + CALL GEMM('T','N',IN,KF,IM,1.0_JPRM,YDNODE%PNONIM(1),IM,PVECIN(1,1),IM,0.0_JPRM,ZVECOUT(YDNODE%IRANK+1,1),YDNODE%ICOLS) ENDIF ENDIF DO JF=1,KF diff --git a/src/trans/algor/ectrans_blas_mod.F90 b/src/trans/algor/ectrans_blas_mod.F90 new file mode 100644 index 00000000..256836df --- /dev/null +++ b/src/trans/algor/ectrans_blas_mod.F90 @@ -0,0 +1,310 @@ +! (C) Copyright 2024- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +!==================================================================== +MODULE ECTRANS_BLAS_MOD +!==================================================================== +! Author: Willem Deconinck (ECMWF) +! +! This module provides interfaces for BLAS routines such as +! DGEMM/SGEMM and DGEMV/SGEMV +! The correct overload is used depending on the precision of the arguments +!==================================================================== + + +USE EC_PARKIND, ONLY : JPRD, JPRM, JPIM + +IMPLICIT NONE + +PRIVATE + +PUBLIC :: GEMM, GEMV + +!--------------------------------------------------------------------- + +INTERFACE GEMM +! GEMM performs one of the matrix-matrix operations +! +! C := alpha*op( A )*op( B ) + beta*C, +! +! where op( X ) is one of +! +! op( X ) = X or op( X ) = X**T, +! +! alpha and beta are scalars, and A, B and C are matrices, with op( A ) +! an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. + + ! SGEMM + MODULE PROCEDURE GEMM_SP ! Matrix arguments as array, (alpha,beta) in SP + MODULE PROCEDURE GEMM_SP_DP ! Matrix arguments as array, (alpha,beta) in DP + MODULE PROCEDURE GEMM_SCAL_SP ! Matrix arguments as scalar (address), (alpha,beta) in SP + MODULE PROCEDURE GEMM_SCAL_SP_DP ! Matrix arguments as scalar (address), (alpha,beta) in DP + + ! DGEMM + MODULE PROCEDURE GEMM_DP ! Matrix arguments as array, (alpha,beta) in DP + MODULE PROCEDURE GEMM_DP_SP ! Matrix arguments as array, (alpha,beta) in SP + MODULE PROCEDURE GEMM_SCAL_DP ! Matrix arguments as scalar (address), (alpha,beta) in DP + MODULE PROCEDURE GEMM_SCAL_DP_SP ! Matrix arguments as scalar (address), (alpha,beta) in SP +END INTERFACE + +!--------------------------------------------------------------------- + +INTERFACE GEMV +! GEMV performs one of the matrix-vector operations +! +! y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, +! +! where alpha and beta are scalars, x and y are vectors and A is an +! m by n matrix. + + ! SGEMV + MODULE PROCEDURE GEMV_SP ! Matrix/Vector arguments as array, (alpha,beta) in SP + MODULE PROCEDURE GEMV_SP_DP ! Matrix/Vector arguments as array, (alpha,beta) in DP + MODULE PROCEDURE GEMV_SCAL_SP ! Matrix/Vector arguments as scalar (address), (alpha,beta) in SP + MODULE PROCEDURE GEMV_SCAL_SP_DP ! Matrix/Vector arguments as scalar (address), (alpha,beta) in DP + + ! DGEMV + MODULE PROCEDURE GEMV_DP ! Matrix/Vector arguments as array, (alpha,beta) in DP + MODULE PROCEDURE GEMV_DP_SP ! Matrix/Vector arguments as array, (alpha,beta) in SP + MODULE PROCEDURE GEMV_SCAL_DP ! Matrix/Vector arguments as scalar (address), (alpha,beta) in DP + MODULE PROCEDURE GEMV_SCAL_DP_SP ! Matrix/Vector arguments as scalar (address), (alpha,beta) in SP +END INTERFACE + +!--------------------------------------------------------------------- + +!==================================================================== +CONTAINS +!==================================================================== + +SUBROUTINE GEMM_SCAL_DP(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + REAL(KIND=JPRD) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: K, LDA, LDB, LDC, M, N + CHARACTER ,INTENT(IN) :: TRANSA, TRANSB + REAL(KIND=JPRD) ,INTENT(IN) :: A, B + REAL(KIND=JPRD) ,INTENT(INOUT) :: C + + CALL DGEMM(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + +END SUBROUTINE GEMM_SCAL_DP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMM_SCAL_DP_SP(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + REAL(KIND=JPRM) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: K, LDA, LDB, LDC, M, N + CHARACTER ,INTENT(IN) :: TRANSA, TRANSB + REAL(KIND=JPRD) ,INTENT(IN) :: A, B + REAL(KIND=JPRD) ,INTENT(INOUT) :: C + + CALL GEMM_SCAL_DP(TRANSA, TRANSB, M, N, K, REAL(ALPHA,JPRD), A, LDA, B, LDB, REAL(BETA,JPRD), C, LDC) + +END SUBROUTINE GEMM_SCAL_DP_SP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMM_DP(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + REAL(KIND=JPRD) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: K, LDA, LDB, LDC, M, N + CHARACTER ,INTENT(IN) :: TRANSA, TRANSB + REAL(KIND=JPRD) ,INTENT(IN) :: A(LDA,*), B(LDB,*) + REAL(KIND=JPRD) ,INTENT(INOUT) :: C(LDC,*) + + CALL GEMM_SCAL_DP(TRANSA, TRANSB, M, N, K, ALPHA, A(1,1), LDA, B(1,1), LDB, BETA, C(1,1), LDC) + +END SUBROUTINE GEMM_DP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMM_DP_SP(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + REAL(KIND=JPRM) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: K, LDA, LDB, LDC, M, N + CHARACTER ,INTENT(IN) :: TRANSA, TRANSB + REAL(KIND=JPRD) ,INTENT(IN) :: A(LDA,*), B(LDB,*) + REAL(KIND=JPRD) ,INTENT(INOUT) :: C(LDC,*) + + CALL GEMM_SCAL_DP(TRANSA, TRANSB, M, N, K, REAL(ALPHA,JPRD), A(1,1), LDA, B(1,1), LDB, REAL(BETA,JPRD), C(1,1), LDC) + +END SUBROUTINE GEMM_DP_SP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMM_SCAL_SP(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + USE, INTRINSIC :: IEEE_EXCEPTIONS, ONLY : IEEE_GET_HALTING_MODE, IEEE_SET_HALTING_MODE, IEEE_INVALID + + REAL(KIND=JPRM) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: K, LDA, LDB, LDC, M, N + CHARACTER ,INTENT(IN) :: TRANSA, TRANSB + REAL(KIND=JPRM) ,INTENT(IN) :: A, B + REAL(KIND=JPRM) ,INTENT(INOUT) :: C + +#ifdef WITH_IEEE_HALT + LOGICAL, PARAMETER :: LL_IEEE_HALT = .TRUE. +#else + LOGICAL, PARAMETER :: LL_IEEE_HALT = .FALSE. +#endif + LOGICAL :: LL_HALT_INVALID = .FALSE. + + IF (LL_IEEE_HALT) THEN + CALL IEEE_GET_HALTING_MODE(IEEE_INVALID,LL_HALT_INVALID) + IF (LL_HALT_INVALID) CALL IEEE_SET_HALTING_MODE(IEEE_INVALID, .FALSE.) + ENDIF + + CALL SGEMM(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + + IF (LL_IEEE_HALT .AND. LL_HALT_INVALID) CALL IEEE_SET_HALTING_MODE(IEEE_INVALID, .TRUE.) + +END SUBROUTINE GEMM_SCAL_SP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMM_SCAL_SP_DP(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + REAL(KIND=JPRD) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: K, LDA, LDB, LDC, M, N + CHARACTER ,INTENT(IN) :: TRANSA, TRANSB + REAL(KIND=JPRM) ,INTENT(IN) :: A, B + REAL(KIND=JPRM) ,INTENT(INOUT) :: C + + CALL GEMM_SCAL_SP(TRANSA, TRANSB, M, N, K, REAL(ALPHA,JPRM), A, LDA, B, LDB, REAL(BETA,JPRM), C, LDC) + +END SUBROUTINE GEMM_SCAL_SP_DP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMM_SP(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + REAL(KIND=JPRM) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: K, LDA, LDB, LDC, M, N + CHARACTER ,INTENT(IN) :: TRANSA, TRANSB + REAL(KIND=JPRM) ,INTENT(IN) :: A(LDA,*), B(LDB,*) + REAL(KIND=JPRM) ,INTENT(INOUT) :: C(LDC,*) + + CALL GEMM_SCAL_SP(TRANSA, TRANSB, M, N, K, ALPHA, A(1,1), LDA, B(1,1), LDB, BETA, C(1,1), LDC) + +END SUBROUTINE GEMM_SP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMM_SP_DP(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + REAL(KIND=JPRD) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: K, LDA, LDB, LDC, M, N + CHARACTER ,INTENT(IN) :: TRANSA, TRANSB + REAL(KIND=JPRM) ,INTENT(IN) :: A(LDA,*), B(LDB,*) + REAL(KIND=JPRM) ,INTENT(INOUT) :: C(LDC,*) + + CALL GEMM_SCAL_SP(TRANSA, TRANSB, M, N, K, REAL(ALPHA,JPRM), A(1,1), LDA, B(1,1), LDB, REAL(BETA,JPRM), C(1,1), LDC) + +END SUBROUTINE GEMM_SP_DP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMV_SCAL_SP(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + REAL(KIND=JPRM) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: LDA, INCX, INCY, M, N + CHARACTER ,INTENT(IN) :: TRANS + REAL(KIND=JPRM) ,INTENT(IN) :: A, X + REAL(KIND=JPRM) ,INTENT(INOUT) :: Y + + CALL SGEMV(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + +END SUBROUTINE GEMV_SCAL_SP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMV_SCAL_SP_DP(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + REAL(KIND=JPRD) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: LDA, INCX, INCY, M, N + CHARACTER ,INTENT(IN) :: TRANS + REAL(KIND=JPRM) ,INTENT(IN) :: A, X + REAL(KIND=JPRM) ,INTENT(INOUT) :: Y + + CALL GEMV_SCAL_SP(TRANS, M, N, REAL(ALPHA,JPRM), A, LDA, X, INCX, REAL(BETA,JPRM), Y, INCY) + +END SUBROUTINE GEMV_SCAL_SP_DP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMV_SP(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + REAL(KIND=JPRM) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: LDA, INCX, INCY, M, N + CHARACTER ,INTENT(IN) :: TRANS + REAL(KIND=JPRM) ,INTENT(IN) :: A(:,:), X(:) + REAL(KIND=JPRM) ,INTENT(INOUT) :: Y(:) + + CALL GEMV_SCAL_SP(TRANS, M, N, ALPHA, A(1,1), LDA, X(1), INCX, BETA, Y(1), INCY) + +END SUBROUTINE GEMV_SP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMV_SP_DP(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + REAL(KIND=JPRD) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: LDA, INCX, INCY, M, N + CHARACTER ,INTENT(IN) :: TRANS + REAL(KIND=JPRM) ,INTENT(IN) :: A(:,:), X(:) + REAL(KIND=JPRM) ,INTENT(INOUT) :: Y(:) + + CALL GEMV_SCAL_SP(TRANS, M, N, REAL(ALPHA,JPRM), A(1,1), LDA, X(1), INCX, REAL(BETA,JPRM), Y(1), INCY) + +END SUBROUTINE GEMV_SP_DP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMV_SCAL_DP(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + REAL(KIND=JPRD) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: LDA, INCX, INCY, M, N + CHARACTER ,INTENT(IN) :: TRANS + REAL(KIND=JPRD) ,INTENT(IN) :: A, X + REAL(KIND=JPRD) ,INTENT(INOUT) :: Y + + CALL DGEMV(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + +END SUBROUTINE GEMV_SCAL_DP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMV_SCAL_DP_SP(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + REAL(KIND=JPRM) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: LDA, INCX, INCY, M, N + CHARACTER ,INTENT(IN) :: TRANS + REAL(KIND=JPRD) ,INTENT(IN) :: A, X + REAL(KIND=JPRD) ,INTENT(INOUT) :: Y + + CALL GEMV_SCAL_DP(TRANS, M, N, REAL(ALPHA,JPRD), A, LDA, X, INCX, REAL(BETA,JPRD), Y, INCY) + +END SUBROUTINE GEMV_SCAL_DP_SP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMV_DP(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + REAL(KIND=JPRD) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: LDA, INCX, INCY, M, N + CHARACTER ,INTENT(IN) :: TRANS + REAL(KIND=JPRD) ,INTENT(IN) :: A(:,:), X(:) + REAL(KIND=JPRD) ,INTENT(INOUT) :: Y(:) + + CALL GEMV_SCAL_DP(TRANS, M, N, ALPHA, A(1,1), LDA, X(1), INCX, BETA, Y(1), INCY) + +END SUBROUTINE GEMV_DP + +!--------------------------------------------------------------------- + +SUBROUTINE GEMV_DP_SP(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + REAL(KIND=JPRM) ,INTENT(IN) :: ALPHA, BETA + INTEGER(KIND=JPIM) ,INTENT(IN) :: LDA, INCX, INCY, M, N + CHARACTER ,INTENT(IN) :: TRANS + REAL(KIND=JPRD) ,INTENT(IN) :: A(:,:), X(:) + REAL(KIND=JPRD) ,INTENT(INOUT) :: Y(:) + + CALL GEMV_SCAL_DP(TRANS, M, N, REAL(ALPHA,JPRD), A(1,1), LDA, X(1), INCX, REAL(BETA,JPRD), Y(1), INCY) + +END SUBROUTINE GEMV_DP_SP + +!==================================================================== + +END MODULE ECTRANS_BLAS_MOD + diff --git a/src/trans/internal/ledir_mod.F90 b/src/trans/internal/ledir_mod.F90 index 2fa0ee5e..9c0e4e00 100644 --- a/src/trans/internal/ledir_mod.F90 +++ b/src/trans/internal/ledir_mod.F90 @@ -61,9 +61,7 @@ SUBROUTINE LEDIR(KM,KMLOC,KFC,KIFC,KSL,KDGLU,KLED2,PAIA,PSIA,POA1,PW) USE TPM_DIM ,ONLY : R USE TPM_FLT ,ONLY : S USE BUTTERFLY_ALG_MOD, ONLY : MULT_BUTM - -use, intrinsic :: ieee_exceptions - +USE ECTRANS_BLAS_MOD, ONLY : GEMM IMPLICIT NONE @@ -86,12 +84,6 @@ SUBROUTINE LEDIR(KM,KMLOC,KFC,KIFC,KSL,KDGLU,KLED2,PAIA,PSIA,POA1,PW) INTEGER(KIND=JPIM) :: ITHRESHOLD REAL(KIND=JPRB) :: ZB(KDGLU,KIFC), ZCA((R%NTMAX-KM+2)/2,KIFC), ZCS((R%NTMAX-KM+3)/2,KIFC) REAL(KIND=JPRD), allocatable :: ZB_D(:,:), ZCA_D(:,:), ZCS_D(:,:),ZRPNMA(:,:), ZRPNMS(:,:) -LOGICAL :: LL_HALT_INVALID -#ifdef WITH_IEEE_HALT -LOGICAL, PARAMETER :: LL_IEEE_HALT = .TRUE. -#else -LOGICAL, PARAMETER :: LL_IEEE_HALT = .FALSE. -#endif LOGICAL, PARAMETER :: LLDOUBLE = (JPRB == JPRD) CHARACTER(LEN=1) :: CLX REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -136,17 +128,12 @@ SUBROUTINE LEDIR(KM,KMLOC,KFC,KIFC,KSL,KDGLU,KLED2,PAIA,PSIA,POA1,PW) IF (LHOOK) CALL DR_HOOK('LEDIR_'//CLX//'GEMM_1',0,ZHOOK_HANDLE) IF (LLDOUBLE) THEN - CALL DGEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRD,S%FA(KMLOC)%RPNMA,KDGLU,& + CALL GEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRD,S%FA(KMLOC)%RPNMA,KDGLU,& &ZB,KDGLU,0._JPRD,ZCA,ILA) ELSE IF(KM>=1)THEN ! DGEM for the mean to improve mass conservation - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRM,S%FA(KMLOC)%RPNMA,KDGLU,& + CALL GEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRM,S%FA(KMLOC)%RPNMA,KDGLU,& &ZB,KDGLU,0._JPRM,ZCA,ILA) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) ELSE I1 = size(S%FA(KMLOC)%RPNMA(:,1)) I2 = size(S%FA(KMLOC)%RPNMA(1,:)) @@ -165,7 +152,7 @@ SUBROUTINE LEDIR(KM,KMLOC,KFC,KIFC,KSL,KDGLU,KLED2,PAIA,PSIA,POA1,PW) ZRPNMA(I3,I4) = S%FA(KMLOC)%RPNMA(I3,I4) END DO END DO - CALL DGEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRD,ZRPNMA,KDGLU,& + CALL GEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRD,ZRPNMA,KDGLU,& &ZB_D,KDGLU,0._JPRD,ZCA_D,ILA) IFLD=0 DO JK=1,KFC,ISKIP @@ -210,17 +197,12 @@ SUBROUTINE LEDIR(KM,KMLOC,KFC,KIFC,KSL,KDGLU,KLED2,PAIA,PSIA,POA1,PW) IF (LHOOK) CALL DR_HOOK('LEDIR_'//CLX//'GEMM_2',0,ZHOOK_HANDLE) IF (LLDOUBLE) THEN - CALL DGEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRD,S%FA(KMLOC)%RPNMS,KDGLU,& + CALL GEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRD,S%FA(KMLOC)%RPNMS,KDGLU,& &ZB,KDGLU,0._JPRD,ZCS,ILS) ELSE IF(KM>=1)THEN ! DGEM for the mean to improve mass conservation - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRM,S%FA(KMLOC)%RPNMS,KDGLU,& + CALL GEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRM,S%FA(KMLOC)%RPNMS,KDGLU,& &ZB,KDGLU,0._JPRM,ZCS,ILS) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) ELSE I1 = size(S%FA(KMLOC)%RPNMS(:,1)) I2 = size(S%FA(KMLOC)%RPNMS(1,:)) @@ -239,7 +221,7 @@ SUBROUTINE LEDIR(KM,KMLOC,KFC,KIFC,KSL,KDGLU,KLED2,PAIA,PSIA,POA1,PW) ZRPNMS(I3,I4) = S%FA(KMLOC)%RPNMS(I3,I4) END DO END DO - CALL DGEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRD,ZRPNMS,KDGLU,& + CALL GEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRD,ZRPNMS,KDGLU,& &ZB_D,KDGLU,0._JPRD,ZCS_D,ILS) IFLD=0 DO JK=1,KFC,ISKIP diff --git a/src/trans/internal/ledirad_mod.F90 b/src/trans/internal/ledirad_mod.F90 index fa8a3b95..96decfe8 100644 --- a/src/trans/internal/ledirad_mod.F90 +++ b/src/trans/internal/ledirad_mod.F90 @@ -68,6 +68,7 @@ SUBROUTINE LEDIRAD(KM,KMLOC,KFC,KIFC,KDGLU,KLED2,PAIA,PSIA,POA1) USE TPM_FLT ,ONLY : S USE TPM_FIELDS ,ONLY : F USE BUTTERFLY_ALG_MOD ,ONLY: MULT_BUTM +USE ECTRANS_BLAS_MOD, ONLY : GEMM IMPLICIT NONE @@ -86,7 +87,6 @@ SUBROUTINE LEDIRAD(KM,KMLOC,KFC,KIFC,KDGLU,KLED2,PAIA,PSIA,POA1) INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, J, JK,JGL,J1 INTEGER(KIND=JPIM) :: IFLD,ITHRESHOLD REAL(KIND=JPRB) :: ZB(KDGLU,KIFC), ZCA((R%NTMAX-KM+2)/2,KIFC), ZCS((R%NTMAX-KM+3)/2,KIFC) -LOGICAL, PARAMETER :: LLDOUBLE = (JPRD == JPRB) CHARACTER(LEN=1) :: CLX REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -98,7 +98,7 @@ SUBROUTINE LEDIRAD(KM,KMLOC,KFC,KIFC,KDGLU,KLED2,PAIA,PSIA,POA1) !* 1.1 PREPARATIONS. CLX = 'S' -IF (LLDOUBLE) CLX = 'D' +IF (JPRD == JPRB) CLX = 'D' IA = 1+MOD(R%NTMAX-KM+2,2) IS = 1+MOD(R%NTMAX-KM+1,2) @@ -134,15 +134,9 @@ SUBROUTINE LEDIRAD(KM,KMLOC,KFC,KIFC,KDGLU,KLED2,PAIA,PSIA,POA1) ENDDO IF(ILA <= ITHRESHOLD .OR. .NOT.S%LUSEFLT) THEN - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',0,ZHOOK_HANDLE) - IF(LLDOUBLE)THEN - CALL DGEMM('N','N',KDGLU,KIFC,ILA,1.0_JPRD,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZCA,ILA,0._JPRD,ZB,KDGLU) - ELSE - CALL SGEMM('N','N',KDGLU,KIFC,ILA,1.0_JPRM,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZCA,ILA,0._JPRM,ZB,KDGLU) - END IF - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',1,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',0,ZHOOK_HANDLE) + CALL GEMM('N','N',KDGLU,KIFC,ILA,1.0_JPRB,S%FA(KMLOC)%RPNMA,KDGLU,ZCA,ILA,0._JPRB,ZB,KDGLU) + IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',1,ZHOOK_HANDLE) ELSE @@ -173,14 +167,7 @@ SUBROUTINE LEDIRAD(KM,KMLOC,KFC,KIFC,KDGLU,KLED2,PAIA,PSIA,POA1) IF(ILS <= ITHRESHOLD .OR. .NOT.S%LUSEFLT) THEN IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_2',0,ZHOOK_HANDLE) - IF(LLDOUBLE)THEN - CALL DGEMM('N','N',KDGLU,KIFC,ILS,1.0_JPRD,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZCS,ILS,0._JPRD,ZB,KDGLU) - ELSE - CALL SGEMM('N','N',KDGLU,KIFC,ILS,1.0_JPRM,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZCS,ILS,0._JPRM,ZB,KDGLU) - - END IF + CALL GEMM('N','N',KDGLU,KIFC,ILS,1.0_JPRD,S%FA(KMLOC)%RPNMS,KDGLU,ZCS,ILS,0._JPRD,ZB,KDGLU) IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_2',1,ZHOOK_HANDLE) ELSE diff --git a/src/trans/internal/leinv_mod.F90 b/src/trans/internal/leinv_mod.F90 index a753c96c..5d6a3308 100644 --- a/src/trans/internal/leinv_mod.F90 +++ b/src/trans/internal/leinv_mod.F90 @@ -50,7 +50,7 @@ SUBROUTINE LEINV(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KSL,KDGLU,PIA,PAOA1,PSOA1) ! ! Modifications. ! -------------- -! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: +! J.Hague : Oct 2012 DR_HOOK round calls to GEMM: ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ @@ -60,8 +60,7 @@ SUBROUTINE LEINV(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KSL,KDGLU,PIA,PAOA1,PSOA1) USE TPM_DIM ,ONLY : R USE TPM_FLT ,ONLY : S USE BUTTERFLY_ALG_MOD, ONLY : MULT_BUTM - -use, intrinsic :: ieee_exceptions +USE ECTRANS_BLAS_MOD, ONLY : GEMM IMPLICIT NONE @@ -80,13 +79,6 @@ SUBROUTINE LEINV(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KSL,KDGLU,PIA,PAOA1,PSOA1) INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, J1, IFLD, JGL,JK, J,JI, IEND INTEGER(KIND=JPIM) :: ITHRESHOLD REAL(KIND=JPRB) :: ZBA((R%NSMAX-KM+2)/2,KIFC), ZBS((R%NSMAX-KM+3)/2,KIFC), ZC(KDGLU,KIFC) -LOGICAL :: LL_HALT_INVALID -#ifdef WITH_IEEE_HALT -LOGICAL, PARAMETER :: LL_IEEE_HALT = .TRUE. -#else -LOGICAL, PARAMETER :: LL_IEEE_HALT = .FALSE. -#endif -LOGICAL, PARAMETER :: LLDOUBLE = (JPRB == JPRD) CHARACTER(LEN=1) :: CLX REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -98,7 +90,7 @@ SUBROUTINE LEINV(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KSL,KDGLU,PIA,PAOA1,PSOA1) !* 1.1 PREPARATIONS. CLX = 'S' -IF (LLDOUBLE) CLX = 'D' +IF (JPRB == JPRD) CLX = 'D' !ISL = MAX(R%NDGNH-G%NDGLU(KM)+1,1) ISL = KSL @@ -138,20 +130,9 @@ SUBROUTINE LEINV(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KSL,KDGLU,PIA,PAOA1,PSOA1) IF(ILA <= ITHRESHOLD .OR. .NOT.S%LUSEFLT) THEN IF (LHOOK) CALL DR_HOOK('LEINV_'//CLX//'GEMM_1',0,ZHOOK_HANDLE) - IF (LLDOUBLE) THEN - CALL DGEMM('N','N',KDGLU,KIFC,ILA,1.0_JPRD,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZBA,ILA,0._JPRD,ZC,KDGLU) - ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('N','N',KDGLU,KIFC,ILA,1.0_JPRM,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZBA,ILA,0._JPRM,ZC,KDGLU) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) - ENDIF + CALL GEMM('N','N',KDGLU,KIFC,ILA,1.0_JPRB,S%FA(KMLOC)%RPNMA,KDGLU,ZBA,ILA,0._JPRB,ZC,KDGLU) IF (LHOOK) CALL DR_HOOK('LEINV_'//CLX//'GEMM_1',1,ZHOOK_HANDLE) - + ELSE IF (LHOOK) CALL DR_HOOK('LEINV_'//CLX//'BUTM_1',0,ZHOOK_HANDLE) @@ -182,18 +163,7 @@ SUBROUTINE LEINV(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KSL,KDGLU,PIA,PAOA1,PSOA1) IF(ILS <= ITHRESHOLD .OR. .NOT.S%LUSEFLT ) THEN IF (LHOOK) CALL DR_HOOK('LEINV_'//CLX//'GEMM_2',0,ZHOOK_HANDLE) - IF (LLDOUBLE) THEN - CALL DGEMM('N','N',KDGLU,KIFC,ILS,1.0_JPRD,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZBS,ILS,0._JPRD,ZC,KDGLU) - ELSE - IF (LL_IEEE_HALT) THEN - call ieee_get_halting_mode(ieee_invalid,LL_HALT_INVALID) - if (LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.false.) - ENDIF - CALL SGEMM('N','N',KDGLU,KIFC,ILS,1.0_JPRM,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZBS,ILS,0._JPRM,ZC,KDGLU) - if (LL_IEEE_HALT .and. LL_HALT_INVALID) call ieee_set_halting_mode(ieee_invalid,.true.) - ENDIF + CALL GEMM('N','N',KDGLU,KIFC,ILS,1.0_JPRB,S%FA(KMLOC)%RPNMS,KDGLU,ZBS,ILS,0._JPRB,ZC,KDGLU) IF (LHOOK) CALL DR_HOOK('LEINV_'//CLX//'GEMM_2',1,ZHOOK_HANDLE) ELSE diff --git a/src/trans/internal/leinvad_mod.F90 b/src/trans/internal/leinvad_mod.F90 index b6f37c01..9eb8f95a 100644 --- a/src/trans/internal/leinvad_mod.F90 +++ b/src/trans/internal/leinvad_mod.F90 @@ -62,6 +62,7 @@ SUBROUTINE LEINVAD(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KDGLU,PIA,PAOA1,PSOA1) ! USE TPM_FLT ,ONLY : S USE BUTTERFLY_ALG_MOD,ONLY : MULT_BUTM +USE ECTRANS_BLAS_MOD, ONLY : GEMM IMPLICIT NONE @@ -79,7 +80,6 @@ SUBROUTINE LEINVAD(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KDGLU,PIA,PAOA1,PSOA1) INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, IOAD1, JK,JI INTEGER(KIND=JPIM) :: IFLD,ITHRESHOLD REAL(KIND=JPRB) :: ZBA((R%NSMAX-KM+2)/2,KIFC), ZBS((R%NSMAX-KM+3)/2,KIFC), ZC(KDGLU,KIFC) -LOGICAL, PARAMETER :: LLDOUBLE = (JPRD == JPRB) CHARACTER(LEN=1) :: CLX REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -91,7 +91,7 @@ SUBROUTINE LEINVAD(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KDGLU,PIA,PAOA1,PSOA1) !* 1.1 PREPARATIONS. CLX = 'S' -IF (LLDOUBLE) CLX = 'D' +IF (JPRD == JPRB) CLX = 'D' IA = 1+MOD(R%NSMAX-KM+2,2) IS = 1+MOD(R%NSMAX-KM+1,2) @@ -123,15 +123,9 @@ SUBROUTINE LEINVAD(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KDGLU,PIA,PAOA1,PSOA1) ENDDO IF(ILA <= ITHRESHOLD .OR. .NOT.S%LUSEFLT) THEN - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',0,ZHOOK_HANDLE) - IF(LLDOUBLE)THEN - CALL DGEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRD,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZC,KDGLU,0._JPRD,ZBA,ILA) - ELSE - CALL SGEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRM,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZC,KDGLU,0._JPRM,ZBA,ILA) - END IF - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',1,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',0,ZHOOK_HANDLE) + CALL GEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRB,S%FA(KMLOC)%RPNMA,KDGLU,ZC,KDGLU,0._JPRB,ZBA,ILA) + IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',1,ZHOOK_HANDLE) ELSE @@ -161,13 +155,7 @@ SUBROUTINE LEINVAD(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KDGLU,PIA,PAOA1,PSOA1) IF(ILS <= ITHRESHOLD .OR. .NOT.S%LUSEFLT ) THEN IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_2',0,ZHOOK_HANDLE) - IF(LLDOUBLE)THEN - CALL DGEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRD,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZC,KDGLU,0._JPRD,ZBS,ILS) - ELSE - CALL SGEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRM,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZC,KDGLU,0._JPRM,ZBS,ILS) - END IF + CALL GEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRB,S%FA(KMLOC)%RPNMS,KDGLU,ZC,KDGLU,0._JPRB,ZBS,ILS) IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_2',1,ZHOOK_HANDLE) ELSE From cc5db9862c097fe734a087f609498b5c15fdd682 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 7 Mar 2024 09:09:27 +0000 Subject: [PATCH 12/48] Fix typo in ordering of fields in ZGMV --- src/programs/ectrans-benchmark.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index ff7dae22..b0a2db44 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -495,8 +495,8 @@ program transform_test jend_vder_EW = jend_uv endif -jbegin_sc = jbegin_vder_EW + 1 -jend_sc = jbegin_vder_EW + nfld +jbegin_sc = jend_vder_EW + 1 +jend_sc = jend_vder_EW + nfld if (lscders) then ndimgmvs = 3 From 9a3055f38b031eae57d8a6b5b4f08340206c8c36 Mon Sep 17 00:00:00 2001 From: DJDavies2 Date: Wed, 6 Mar 2024 17:00:30 +0000 Subject: [PATCH 13/48] Copy original files instead of symbolic link. --- src/transi/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transi/CMakeLists.txt b/src/transi/CMakeLists.txt index 8ae2b746..43e79dfa 100644 --- a/src/transi/CMakeLists.txt +++ b/src/transi/CMakeLists.txt @@ -27,7 +27,7 @@ ecbuild_add_library( TARGET transi_dp ectrans_target_fortran_module_directory( TARGET transi_dp MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/module/trans_dp ) -file( GLOB transi_includes include/ectrans/* ) +set( transi_includes transi.h version.h ) install( FILES ${transi_includes} DESTINATION include/ectrans From 625a1c9d13b0c94f867c61cb00e394e64cc8c359 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab <113430901+awnawab@users.noreply.github.com> Date: Mon, 25 Mar 2024 15:09:37 +0100 Subject: [PATCH 14/48] ECTRANS: set LINKER_LANGUAGE for tests (#71) --- tests/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bf06ec83..ead36975 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -131,53 +131,63 @@ if( HAVE_TRANSI ) ecbuild_add_test( TARGET ectrans_test_transi_program SOURCES transi/transi_test_program.c LIBS ectrans_test + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_timings SOURCES transi/transi_test_timings.c LIBS ectrans_test + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_lonlat SOURCES transi/transi_test_lonlat.c LIBS ectrans_test + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_io SOURCES transi/transi_test_io.c LIBS ectrans_test + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_memory SOURCES transi/transi_test_memory.c LIBS ectrans_test CONDITION EC_HAVE_MALLOC_H + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_memory_lonlat SOURCES transi/transi_test_memory_lonlat.c LIBS ectrans_test CONDITION EC_HAVE_MALLOC_H + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_vordiv_to_UV SOURCES transi/transi_test_vordiv_to_UV.c LIBS ectrans_test + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_dirtrans_adjoint SOURCES transi/transi_test_dirtrans_adjoint.c LIBS ectrans_test + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_invtrans_adjoint SOURCES transi/transi_test_invtrans_adjoint.c LIBS ectrans_test + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) ecbuild_add_test( TARGET ectrans_test_transi_lonlat_diff_incr SOURCES transi/transi_test_lonlat_diff_incr.c LIBS ectrans_test + LINKER_LANGUAGE C ENVIRONMENT TRANS_USE_MPI=0 ) if( HAVE_TESTS ) From 64cfdec6f42e0b5c361d17360c918dd7cc86c264 Mon Sep 17 00:00:00 2001 From: DJDavies2 Date: Fri, 29 Mar 2024 10:44:15 +0000 Subject: [PATCH 15/48] Use flush statement. --- src/programs/ectrans-benchmark.F90 | 2 +- src/trans/internal/abort_trans_mod.F90 | 2 +- tests/trans/test_adjoint.F90 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index b0a2db44..b680f3d9 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -711,7 +711,7 @@ program transform_test zmaxerr(:) = -999.0 do ifld = 1, 1 write(nout,*) "znormsp", znormsp - call flush(nout) + flush(nout) zerr(1) = abs(znormsp1(ifld)/znormsp(ifld) - 1.0_jprb) zmaxerr(1) = max(zmaxerr(1), zerr(1)) enddo diff --git a/src/trans/internal/abort_trans_mod.F90 b/src/trans/internal/abort_trans_mod.F90 index aebc5b8a..3efe7f95 100644 --- a/src/trans/internal/abort_trans_mod.F90 +++ b/src/trans/internal/abort_trans_mod.F90 @@ -30,7 +30,7 @@ SUBROUTINE ABORT_TRANS(CDTEXT) CALL MPL_ABORT(CDTEXT) ELSE CALL SDL_TRACEBACK - CALL FLUSH(0) + FLUSH(0) CALL SDL_SRLABORT ENDIF diff --git a/tests/trans/test_adjoint.F90 b/tests/trans/test_adjoint.F90 index 0ef987ac..fae52c50 100644 --- a/tests/trans/test_adjoint.F90 +++ b/tests/trans/test_adjoint.F90 @@ -140,7 +140,7 @@ PROGRAM TEST_ADJOINT ENDDO WRITE(NOUT,*)' NFLEV=',NFLEV WRITE(NOUT,*) 'SETUP FINISHED' -CALL FLUSH(NOUT) +FLUSH(NOUT) ALLOCATE(ZSPECX(NFLEV,NSPEC2)) ALLOCATE(ZSPECY(NFLEV,NSPEC2)) From 9d1c2c660e4050cbb32a16c07beb65c3cac19a6d Mon Sep 17 00:00:00 2001 From: DJDavies2 Date: Fri, 29 Mar 2024 11:02:14 +0000 Subject: [PATCH 16/48] Rearrange declarations. --- src/programs/ectrans-benchmark.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index b0a2db44..94824e98 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -1138,8 +1138,8 @@ end subroutine str2int subroutine sort(a, n) - real(kind=jprd), intent(inout) :: a(n) integer(kind=jpim), intent(in) :: n + real(kind=jprd), intent(inout) :: a(n) real(kind=jprd) :: x From 895babf3a2d95c17054060b41c8a5fbf3552c367 Mon Sep 17 00:00:00 2001 From: DJDavies2 Date: Fri, 29 Mar 2024 14:56:49 +0000 Subject: [PATCH 17/48] Fix format. --- src/programs/ectrans-benchmark.F90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index b0a2db44..05f6ae80 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -435,10 +435,10 @@ program transform_test write(nout,'("ngpblks ",i0)') ngpblks write(nout,'("nspec2 ",i0)') nspec2 write(nout,'("nspec2g ",i0)') nspec2g - write(nout,'("luseflt ",l)') luseflt - write(nout,'("lvordiv ",l)') lvordiv - write(nout,'("lscders ",l)') lscders - write(nout,'("luvders ",l)') luvders + write(nout,'("luseflt ",l1)') luseflt + write(nout,'("lvordiv ",l1)') lvordiv + write(nout,'("lscders ",l1)') lscders + write(nout,'("luvders ",l1)') luvders write(nout,'(" ")') write(nout,'(a)') '======= End of runtime parameters =======' write(nout,'(" ")') From a501b7faac605ab71cfc594576a5b26b9064659c Mon Sep 17 00:00:00 2001 From: DJDavies2 Date: Fri, 29 Mar 2024 17:12:36 +0000 Subject: [PATCH 18/48] Fix array out of bounds. --- src/programs/ectrans-benchmark.F90 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index b0a2db44..886979ff 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -537,7 +537,9 @@ program transform_test call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor1, kvset=ivset(1:nflevg)) call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv1, kvset=ivset(1:nflevg)) - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg)) + if (nfld > 0) then + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg)) + endif call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc) if (verbosity >= 1) then From 207a686a7894d8a8dc46f71d46687cabbe15cbe6 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 16 Apr 2024 09:35:28 +0000 Subject: [PATCH 19/48] Add logic to avoid reallocating ZCOMBUF[RS] at each call This is taken from orphaned ifs-source PR https://git.ecmwf.int/projects/IFS/repos/ifs-source/pull-requests/1083/overview. Co-authored-by: Olivier Marsden --- src/trans/internal/trgtol_mod.F90 | 53 +++++++++++++++++++++++-------- src/trans/internal/trltog_mod.F90 | 37 ++++++++++++++++----- 2 files changed, 68 insertions(+), 22 deletions(-) diff --git a/src/trans/internal/trgtol_mod.F90 b/src/trans/internal/trgtol_mod.F90 index 064da76a..0d04d288 100644 --- a/src/trans/internal/trgtol_mod.F90 +++ b/src/trans/internal/trgtol_mod.F90 @@ -292,23 +292,44 @@ SUBROUTINE TRGTOL_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRB), ALLOCATABLE :: ZCOMBUFS(:,:) -REAL(KIND=JPRB), ALLOCATABLE :: ZCOMBUFR(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR(:,:) +INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1 + +IF ( .NOT. ALLOCATED(ZCOMBUFS) ) THEN + ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + ISENDCOUNT_PREV = KSENDCOUNT + INSEND_PREV = KNSEND +ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN + DEALLOCATE(ZCOMBUFS) + ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + ISENDCOUNT_PREV = KSENDCOUNT + INSEND_PREV = KNSEND +ENDIF -ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) ! Now, force the OS to allocate this shared array right now, not when it starts to be used which is ! an OPEN-MP loop, that would cause a threads synchronization lock : -IF (KNSEND > 0 .AND. KSENDCOUNT >=-1) ZCOMBUFS(-1,1)=HUGE(1._JPRB) -ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) +IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS(-1,1) = HUGE(1._JPRB) + +IF ( .NOT. ALLOCATED(ZCOMBUFR) ) THEN + ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + IRECVCOUNT_PREV = KRECVCOUNT + INRECV_PREV = KNRECV +ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN + DEALLOCATE(ZCOMBUFR) + ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + IRECVCOUNT_PREV = KRECVCOUNT + INRECV_PREV = KNRECV +ENDIF CALL TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& & ZCOMBUFS,ZCOMBUFR, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2) -DEALLOCATE(ZCOMBUFR) -DEALLOCATE(ZCOMBUFS) - END SUBROUTINE TRGTOL_COMM_HEAP SUBROUTINE TRGTOL_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& @@ -796,8 +817,9 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& !....Pack+send loop......................................................... -!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JBLK,IFIRST,ILAST,& -!$OMP& INS,ISEND,ISETA,ISETB,ISETV,IFLD,IPOS,JFLD) +!$OMP PARALLEL PRIVATE(JBLK,IFIRST,ILAST,ISEND_FLD_START,ISEND_FLD_END,INS,ISEND,ISETA,ISETB,& +!$OMP& ISETV,IFLD,IFLDT,IPOS,JFLD,JK,JJ,JI) +!$OMP DO SCHEDULE(STATIC) DO INS=1,KNSEND ISEND=KSEND(INS) CALL PE2SET(ISEND,ISETA,ISETB,ISETW(INS),ISETV) @@ -825,8 +847,7 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& PCOMBUFS(0,INS) = IFLD ENDDO -!$OMP END PARALLEL DO - +!$OMP END DO DO INS=1,KNSEND ISEND=KSEND(INS) IPOS=IPOSPLUS(INS) @@ -834,7 +855,7 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& ISEND_FLD_START=PCOMBUFS(-1,INS) ISEND_FLD_END = PCOMBUFS(0,INS) -!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(IFLDT,JBLK,IFIRST,ILAST,JK,JJ,JI) +!$OMP DO SCHEDULE(STATIC) DO JJ=ISEND_FLD_START,ISEND_FLD_END IFLDT=IFLDA(JJ,INS) DO JBLK=1,NGPBLKS @@ -877,8 +898,12 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& ENDIF ENDDO ENDDO -!$OMP END PARALLEL DO +!$OMP END DO +ENDDO +!$OMP END PARALLEL +DO INS=1,KNSEND + ISEND=KSEND(INS) IF (NTRANS_SYNC_LEVEL <= 1) THEN CALL MPL_SEND(PCOMBUFS(-1:KSENDTOT(ISEND),INS),KDEST=NPRCIDS(ISEND), & & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ_SEND(INS), & diff --git a/src/trans/internal/trltog_mod.F90 b/src/trans/internal/trltog_mod.F90 index b50f55ed..4dbe2f90 100644 --- a/src/trans/internal/trltog_mod.F90 +++ b/src/trans/internal/trltog_mod.F90 @@ -315,14 +315,38 @@ SUBROUTINE TRLTOG_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC) INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC) -REAL(KIND=JPRB), ALLOCATABLE :: ZCOMBUFS(:,:) -REAL(KIND=JPRB), ALLOCATABLE :: ZCOMBUFR(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR(:,:) +INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1 + +IF ( .NOT. ALLOCATED(ZCOMBUFS) ) THEN + ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + ISENDCOUNT_PREV = KSENDCOUNT + INSEND_PREV = KNSEND +ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN + DEALLOCATE(ZCOMBUFS) + ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + ISENDCOUNT_PREV = KSENDCOUNT + INSEND_PREV = KNSEND +ENDIF -ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) ! Now, force the OS to allocate this shared array right now, not when it starts to be used which is ! an OPEN-MP loop, that would cause a threads synchronization lock : -IF (KNSEND > 0 .AND. KSENDCOUNT >=-1) ZCOMBUFS(-1,1)=HUGE(1._JPRB) -ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) +IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS(-1,1) = HUGE(1._JPRB) + +IF ( .NOT. ALLOCATED(ZCOMBUFR) ) THEN + ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + IRECVCOUNT_PREV = KRECVCOUNT + INRECV_PREV = KNRECV +ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN + DEALLOCATE(ZCOMBUFR) + ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + IRECVCOUNT_PREV = KRECVCOUNT + INRECV_PREV = KNRECV +ENDIF CALL TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& @@ -330,9 +354,6 @@ SUBROUTINE TRLTOG_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, & & KSETAL, KSETBL,KSETWL,KSETVL) -DEALLOCATE(ZCOMBUFR) -DEALLOCATE(ZCOMBUFS) - END SUBROUTINE TRLTOG_COMM_HEAP SUBROUTINE TRLTOG_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& From 4bde94f990e1e3410735fc8acae7da781ef93a78 Mon Sep 17 00:00:00 2001 From: David Davies Date: Mon, 22 Apr 2024 13:10:41 +0100 Subject: [PATCH 20/48] Enable more options to be passed to ctest. (#80) --- tests/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f0f59820..7b86f807 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -22,6 +22,14 @@ if( HAVE_TESTS ) list( APPEND _test_args "-DCMAKE_${lang}_COMPILER=${CMAKE_${lang}_COMPILER}" ) endif() endforeach() + foreach( lang C CXX Fortran ) + if( CMAKE_${lang}_FLAGS ) + list( APPEND _test_args "-DCMAKE_${lang}_FLAGS=${CMAKE_${lang}_FLAGS}" ) + endif() + endforeach() + if( CMAKE_EXE_LINKER_FLAGS ) + list( APPEND _test_args "-DCMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}" ) + endif() if( NOT HAVE_DOUBLE_PRECISION ) list( APPEND _test_args "-DCOMPONENTS=single" ) endif() From 3786b6b9f5662079f798566a8a7e2ac2cc1b2849 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 6 May 2024 13:48:33 +0100 Subject: [PATCH 21/48] Remove need for non-standard SIZEOF (#89) * Remove need for non-standard SIZEOF * Switch to STORAGE_SIZE for checking size of ZCLONEA/S(IMLOC)%COMMSBUF --- src/trans/internal/suleg_mod.F90 | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/trans/internal/suleg_mod.F90 b/src/trans/internal/suleg_mod.F90 index 2430e3ad..619666c6 100644 --- a/src/trans/internal/suleg_mod.F90 +++ b/src/trans/internal/suleg_mod.F90 @@ -9,9 +9,6 @@ ! MODULE SULEG_MOD -#ifdef __NEC__ -#define SIZEOF(x) STORAGE_SIZE(x)/KIND(x) -#endif CONTAINS SUBROUTINE SULEG !DEC$ OPTIMIZE:1 @@ -874,10 +871,9 @@ SUBROUTINE SULEG ZCLONEA(IMLOC)%COMMSBUF(1:ICLONELEN) = ZRCVBUTFV(1:ICLONELEN,JSETV) CALL UNPACK_BUTTERFLY_STRUCT(S%FA(IMLOC)%YBUT_STRUCT_A,ZCLONEA(IMLOC)) ENDIF - IF(ALLOCATED(ZCLONEA(IMLOC)%COMMSBUF) ) THEN - IF( SIZEOF(ZCLONEA(IMLOC)%COMMSBUF) > 0 ) DEALLOCATE(ZCLONEA(IMLOC)%COMMSBUF) - ! ZCLONEA(IMLOC)%COMMSBUF=>NULL() - ENDIF + + IF( STORAGE_SIZE(ZCLONEA(IMLOC)%COMMSBUF) > 0 ) DEALLOCATE(ZCLONEA(IMLOC)%COMMSBUF) + IF( ASSOCIATED(S%FA(IMLOC)%RPNMA) .AND. .NOT. S%LKEEPRPNM ) DEALLOCATE(S%FA(IMLOC)%RPNMA) IF( ASSOCIATED(S%FA(IMLOC)%RPNMDA) ) DEALLOCATE(S%FA(IMLOC)%RPNMDA) ENDIF @@ -1157,10 +1153,9 @@ SUBROUTINE SULEG ZCLONES(IMLOC)%COMMSBUF(1:ICLONELEN) = ZRCVBUTFV(1:ICLONELEN,JSETV) CALL UNPACK_BUTTERFLY_STRUCT( S%FA(IMLOC)%YBUT_STRUCT_S,ZCLONES(IMLOC)) ENDIF - IF( ALLOCATED(ZCLONES(IMLOC)%COMMSBUF) ) THEN - IF( SIZEOF(ZCLONES(IMLOC)%COMMSBUF) > 0 ) DEALLOCATE(ZCLONES(IMLOC)%COMMSBUF) - ! ZCLONES(IMLOC)%COMMSBUF=>NULL() - ENDIF + + IF( STORAGE_SIZE(ZCLONES(IMLOC)%COMMSBUF) > 0 ) DEALLOCATE(ZCLONES(IMLOC)%COMMSBUF) + IF( ASSOCIATED(S%FA(IMLOC)%RPNMS) .AND. .NOT. S%LKEEPRPNM ) DEALLOCATE(S%FA(IMLOC)%RPNMS) IF( ASSOCIATED(S%FA(IMLOC)%RPNMDS) ) DEALLOCATE(S%FA(IMLOC)%RPNMDS) ENDIF From 9000e8e354390b74779c381c3eef75ba898693c3 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 7 May 2024 09:07:08 +0000 Subject: [PATCH 22/48] Add _STACK and _HEAP qualifiers to GTOL and LTOG work buffers --- src/trans/internal/trgtol_mod.F90 | 30 +++++++++++++++--------------- src/trans/internal/trltog_mod.F90 | 30 +++++++++++++++--------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/trans/internal/trgtol_mod.F90 b/src/trans/internal/trgtol_mod.F90 index 0d04d288..9ee2da35 100644 --- a/src/trans/internal/trgtol_mod.F90 +++ b/src/trans/internal/trgtol_mod.F90 @@ -292,42 +292,42 @@ SUBROUTINE TRGTOL_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS(:,:) -REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS_HEAP(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR_HEAP(:,:) INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1 INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1 INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1 INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1 -IF ( .NOT. ALLOCATED(ZCOMBUFS) ) THEN - ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) +IF ( .NOT. ALLOCATED(ZCOMBUFS_HEAP) ) THEN + ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND)) ISENDCOUNT_PREV = KSENDCOUNT INSEND_PREV = KNSEND ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN - DEALLOCATE(ZCOMBUFS) - ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + DEALLOCATE(ZCOMBUFS_HEAP) + ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND)) ISENDCOUNT_PREV = KSENDCOUNT INSEND_PREV = KNSEND ENDIF ! Now, force the OS to allocate this shared array right now, not when it starts to be used which is ! an OPEN-MP loop, that would cause a threads synchronization lock : -IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS(-1,1) = HUGE(1._JPRB) +IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS_HEAP(-1,1) = HUGE(1._JPRB) -IF ( .NOT. ALLOCATED(ZCOMBUFR) ) THEN - ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) +IF ( .NOT. ALLOCATED(ZCOMBUFR_HEAP) ) THEN + ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV)) IRECVCOUNT_PREV = KRECVCOUNT INRECV_PREV = KNRECV ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN - DEALLOCATE(ZCOMBUFR) - ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + DEALLOCATE(ZCOMBUFR_HEAP) + ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV)) IRECVCOUNT_PREV = KRECVCOUNT INRECV_PREV = KNRECV ENDIF CALL TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& - & ZCOMBUFS,ZCOMBUFR, & + & ZCOMBUFS_HEAP,ZCOMBUFR_HEAP, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2) END SUBROUTINE TRGTOL_COMM_HEAP @@ -364,12 +364,12 @@ SUBROUTINE TRGTOL_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRB) :: ZCOMBUFS(-1:KSENDCOUNT,KNSEND) -REAL(KIND=JPRB) :: ZCOMBUFR(-1:KRECVCOUNT,KNRECV) +REAL(KIND=JPRB) :: ZCOMBUFS_STACK(-1:KSENDCOUNT,KNSEND) +REAL(KIND=JPRB) :: ZCOMBUFR_STACK(-1:KRECVCOUNT,KNRECV) CALL TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& - & ZCOMBUFS,ZCOMBUFR, & + & ZCOMBUFS_STACK,ZCOMBUFR_STACK, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2) END SUBROUTINE TRGTOL_COMM_STACK diff --git a/src/trans/internal/trltog_mod.F90 b/src/trans/internal/trltog_mod.F90 index 4dbe2f90..ef0e06c5 100644 --- a/src/trans/internal/trltog_mod.F90 +++ b/src/trans/internal/trltog_mod.F90 @@ -315,42 +315,42 @@ SUBROUTINE TRLTOG_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC) INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC) -REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS(:,:) -REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS_HEAP(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR_HEAP(:,:) INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1 INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1 INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1 INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1 -IF ( .NOT. ALLOCATED(ZCOMBUFS) ) THEN - ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) +IF ( .NOT. ALLOCATED(ZCOMBUFS_HEAP) ) THEN + ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND)) ISENDCOUNT_PREV = KSENDCOUNT INSEND_PREV = KNSEND ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN - DEALLOCATE(ZCOMBUFS) - ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + DEALLOCATE(ZCOMBUFS_HEAP) + ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND)) ISENDCOUNT_PREV = KSENDCOUNT INSEND_PREV = KNSEND ENDIF ! Now, force the OS to allocate this shared array right now, not when it starts to be used which is ! an OPEN-MP loop, that would cause a threads synchronization lock : -IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS(-1,1) = HUGE(1._JPRB) +IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS_HEAP(-1,1) = HUGE(1._JPRB) -IF ( .NOT. ALLOCATED(ZCOMBUFR) ) THEN - ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) +IF ( .NOT. ALLOCATED(ZCOMBUFR_HEAP) ) THEN + ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV)) IRECVCOUNT_PREV = KRECVCOUNT INRECV_PREV = KNRECV ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN - DEALLOCATE(ZCOMBUFR) - ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + DEALLOCATE(ZCOMBUFR_HEAP) + ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV)) IRECVCOUNT_PREV = KRECVCOUNT INRECV_PREV = KNRECV ENDIF CALL TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& - & ZCOMBUFS,ZCOMBUFR, & + & ZCOMBUFS_HEAP,ZCOMBUFR_HEAP, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, & & KSETAL, KSETBL,KSETWL,KSETVL) @@ -393,12 +393,12 @@ SUBROUTINE TRLTOG_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC) INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC) -REAL(KIND=JPRB) :: ZCOMBUFS(-1:KSENDCOUNT,KNSEND) -REAL(KIND=JPRB) :: ZCOMBUFR(-1:KRECVCOUNT,KNRECV) +REAL(KIND=JPRB) :: ZCOMBUFS_STACK(-1:KSENDCOUNT,KNSEND) +REAL(KIND=JPRB) :: ZCOMBUFR_STACK(-1:KRECVCOUNT,KNRECV) CALL TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& - & ZCOMBUFS,ZCOMBUFR, & + & ZCOMBUFS_STACK,ZCOMBUFR_STACK, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, & & KSETAL, KSETBL,KSETWL,KSETVL) From 87cac04447f242757d995bf17ddd4e4b95763a57 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 22 May 2024 15:52:28 +0000 Subject: [PATCH 23/48] Remove nspecresmin --- src/programs/ectrans-benchmark.F90 | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index e3f2085c..1c6a0be5 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -156,7 +156,6 @@ program transform_test integer(kind=jpim) :: nprgpew ! Grid-point decomp integer(kind=jpim) :: nprtrv = 0 ! Spectral decomp integer(kind=jpim) :: nprtrw = 0 ! Spectral decomp -integer(kind=jpim) :: nspecresmin = 80 ! Minimum spectral resolution, for controlling nprtrw integer(kind=jpim) :: mysetv integer(kind=jpim) :: mysetw integer(kind=jpim) :: mp_type = 2 ! Message passing type @@ -290,35 +289,26 @@ program transform_test endif enddo -! From sumpini, although this should be specified in namelist -if (nspecresmin == 0) nspecresmin = nproc - ! Compute nprtrv and nprtrw if not provided on the command line if (nprtrv > 0 .or. nprtrw > 0) then if (nprtrv == 0) nprtrv = nproc/nprtrw if (nprtrw == 0) nprtrw = nproc/nprtrv if (nprtrw*nprtrv /= nproc) call abor1('transform_test:nprtrw*nprtrv /= nproc') - if (nprtrw > nspecresmin) call abor1('transform_test:nprtrw > nspecresmin') else do jprtrv = 4, nproc nprtrv = jprtrv nprtrw = nproc/nprtrv if (nprtrv*nprtrw /= nproc) cycle if (nprtrv > nprtrw) exit - if (nprtrw > nspecresmin) cycle - if (nprtrw <= nspecresmin/(2*oml_max_threads())) exit enddo ! Go for approx square partition for backup - if (nprtrv*nprtrw /= nproc .or. nprtrw > nspecresmin .or. nprtrv > nprtrw) then + if (nprtrv*nprtrw /= nproc .or. nprtrv > nprtrw) then isqr = int(sqrt(real(nproc,jprb))) do ja = isqr, nproc ib = nproc/ja if (ja*ib == nproc) then nprtrw = max(ja, ib) nprtrv = min(ja, ib) - if (nprtrw > nspecresmin ) then - call abor1('transform_test:nprtrw (approx square value) > nspecresmin') - endif exit endif enddo From 73ab525efd49eddfc82897c0872fd053baf36f04 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 4 Jun 2024 09:58:09 +0000 Subject: [PATCH 24/48] Remove fft992 and Bluestein FFT functionality This makes FFTW a required package. --- CMakeLists.txt | 5 +- src/programs/ectrans-benchmark.F90 | 3 +- src/trans/CMakeLists.txt | 25 ++++----- src/trans/external/setup_trans.F90 | 24 +------- src/trans/external/trans_end.F90 | 11 ---- src/trans/include/ectrans/setup_trans.h | 4 +- src/trans/internal/dealloc_resol_mod.F90 | 28 +--------- src/trans/internal/ftdir_mod.F90 | 40 ++------------ src/trans/internal/ftdirad_mod.F90 | 34 ++---------- src/trans/internal/ftinv_mod.F90 | 35 ++---------- src/trans/internal/ftinvad_mod.F90 | 39 ++----------- src/trans/internal/set_resol_mod.F90 | 7 --- src/trans/internal/sufft_mod.F90 | 70 ++---------------------- src/trans/internal/tpm_fft.F90 | 37 ------------- src/trans/internal/tpm_fftw.F90 | 3 +- src/trans/sedrenames.txt | 3 - src/transi/transi_module.F90 | 22 +------- 17 files changed, 38 insertions(+), 352 deletions(-) delete mode 100644 src/trans/internal/tpm_fft.F90 diff --git a/CMakeLists.txt b/CMakeLists.txt index 15dabc14..d293b207 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,10 +55,7 @@ if( NOT HAVE_MKL ) option( FFTW_ENABLE_MKL OFF ) endif() -ecbuild_add_option( FEATURE FFTW - DEFAULT ON - DESCRIPTION "Support for fftw" - REQUIRED_PACKAGES "FFTW COMPONENTS double ${single}" ) +ecbuild_find_package( NAME FFTW REQUIRED COMPONENTS double ${single} ) ecbuild_add_option( FEATURE TRANSI DEFAULT ON diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index e3f2085c..a2a6f737 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -124,7 +124,6 @@ program transform_test logical :: lstatscpu = .false. logical :: lstats_mem = .false. logical :: lxml_stats = .false. -logical :: lfftw = .true. ! Use FFTW for Fourier transforms logical :: lvordiv = .false. logical :: lscders = .false. logical :: luvders = .false. @@ -396,7 +395,7 @@ program transform_test call gstats(2, 0) call setup_trans(ksmax=nsmax, kdgl=ndgl, kloen=nloen, ldsplit=.true., & - & ldusefftw=lfftw, lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & + & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & & lduseflt=luseflt) call gstats(2, 1) diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index f12c632a..38c2ad47 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -190,10 +190,6 @@ function(generate_backend_sources) external/* QUIET ) - if( NOT HAVE_FFTW ) - ecbuild_list_exclude_pattern( LIST files REGEX tpm_fftw.F90 ) - endif() - set(outfiles) foreach(file_i ${files}) @@ -235,20 +231,19 @@ foreach( prec dp sp ) ) target_link_libraries( ectrans_${prec} PUBLIC fiat) - if( HAVE_FFTW ) - set( FFTW_LINK PRIVATE ) - if( LAPACK_LIBRARIES MATCHES "mkl" AND NOT FFTW_LIBRARIES MATCHES "mkl" ) - ecbuild_warn( "Danger: Both MKL and FFTW are linked in trans_${prec}. " - "No guarantees on link order can be made for the final executable.") - set( FFTW_LINK PUBLIC ) # Attempt anyway to give FFTW precedence - endif() - ecbuild_debug("target_link_libraries( trans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} )") - target_link_libraries( ectrans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} ) - target_include_directories( ectrans_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) - target_compile_definitions( ectrans_${prec} PRIVATE WITH_FFTW ) + set( FFTW_LINK PRIVATE ) + if( LAPACK_LIBRARIES MATCHES "mkl" AND NOT FFTW_LIBRARIES MATCHES "mkl" ) + ecbuild_warn( "Danger: Both MKL and FFTW are linked in trans_${prec}. " + "No guarantees on link order can be made for the final executable.") + set( FFTW_LINK PUBLIC ) # Attempt anyway to give FFTW precedence endif() + ecbuild_debug("target_link_libraries( trans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} )") + target_link_libraries( ectrans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} ) + target_include_directories( ectrans_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) + target_compile_definitions( ectrans_${prec} PRIVATE WITH_FFTW ) ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE ${LAPACK_LIBRARIES} )") target_link_libraries( ectrans_${prec} PRIVATE ${LAPACK_LIBRARIES} ) + if( HAVE_OMP ) ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran ) diff --git a/src/trans/external/setup_trans.F90 b/src/trans/external/setup_trans.F90 index 171c2c98..ce1dd482 100644 --- a/src/trans/external/setup_trans.F90 +++ b/src/trans/external/setup_trans.F90 @@ -10,7 +10,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& &KTMAX,KRESOL,PWEIGHT,LDGRIDONLY,LDUSERPNM,LDKEEPRPNM,LDUSEFLT,& -&LDSPSETUPONLY,LDPNMONLY,LDUSEFFTW,& +&LDSPSETUPONLY,LDPNMONLY,& &LDLL,LDSHIFTLL,CDIO_LEGPOL,CDLEGPOLFNAME,KLEGPOLPTR,KLEGPOLPTR_LEN) !**** *SETUP_TRANS* - Setup transform package for specific resolution @@ -53,7 +53,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! LDKEEPRPNM - Keep Legendre Polynomials (only applicable when using ! FLT, otherwise always kept) ! LDPNMONLY - Compute the Legendre polynomials only, not the FFTs. -! LDUSEFFTW - Use FFTW for FFTs ! LDLL - Setup second set of input/output latitudes ! the number of input/output latitudes to transform is equal KDGL ! or KDGL+2 in the case that includes poles + equator @@ -109,10 +108,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_DISTR ,ONLY : D, DISTR_RESOL,NPROC USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL USE TPM_FIELDS ,ONLY : FIELDS_RESOL -USE TPM_FFT ,ONLY : FFT_RESOL, FFTB_RESOL -#ifdef WITH_FFTW USE TPM_FFTW ,ONLY : TW, FFTW_RESOL -#endif USE TPM_FLT ,ONLY : S, FLT_RESOL USE TPM_CTL ,ONLY : C, CTL_RESOL @@ -147,7 +143,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& LOGICAL ,OPTIONAL,INTENT(IN):: LDKEEPRPNM LOGICAL ,OPTIONAL,INTENT(IN):: LDSPSETUPONLY LOGICAL ,OPTIONAL,INTENT(IN):: LDPNMONLY -LOGICAL ,OPTIONAL,INTENT(IN):: LDUSEFFTW LOGICAL ,OPTIONAL,INTENT(IN):: LDLL LOGICAL ,OPTIONAL,INTENT(IN):: LDSHIFTLL CHARACTER(LEN=*),OPTIONAL,INTENT(IN):: CDIO_LEGPOL @@ -182,11 +177,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ALLOCATE(FIELDS_RESOL(NMAX_RESOL)) ALLOCATE(GEOM_RESOL(NMAX_RESOL)) ALLOCATE(DISTR_RESOL(NMAX_RESOL)) - ALLOCATE(FFT_RESOL(NMAX_RESOL)) - ALLOCATE(FFTB_RESOL(NMAX_RESOL)) -#ifdef WITH_FFTW ALLOCATE(FFTW_RESOL(NMAX_RESOL)) -#endif ALLOCATE(FLT_RESOL(NMAX_RESOL)) ALLOCATE(CTL_RESOL(NMAX_RESOL)) GEOM_RESOL(:)%LAM=.FALSE. @@ -227,9 +218,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& S%LUSE_BELUSOV=.TRUE. ! use Belusov algorithm to compute RPNM array instead of per m S%LKEEPRPNM=.FALSE. ! Keep Legendre polonomials (RPNM) S%LUSEFLT=.FALSE. ! Use fast legendre transforms -#ifdef WITH_FFTW -TW%LFFTW=.FALSE. ! Use FFTW interface for FFTs -#endif LLSPSETUPONLY = .FALSE. ! Only create distributed spectral setup S%LDLL = .FALSE. ! use mapping to/from second set of latitudes S%LSHIFTLL = .FALSE. ! shift output lat-lon by 0.5dx, 0.5dy @@ -334,16 +322,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& D%LCPNMONLY=LDPNMONLY ENDIF - -#ifdef WITH_FFTW -IF(PRESENT(LDUSEFFTW)) THEN - TW%LFFTW=LDUSEFFTW -ENDIF -IF( LLSPSETUPONLY .OR. D%LGRIDONLY ) THEN - TW%LFFTW = .FALSE. -ENDIF -#endif - S%LSOUTHPNM=.FALSE. IF(PRESENT(PSTRET)) THEN IF (ABS(PSTRET-1.0_JPRD)>100._JPRD*EPSILON(1._JPRD)) THEN diff --git a/src/trans/external/trans_end.F90 b/src/trans/external/trans_end.F90 index 76900384..93ba42c0 100644 --- a/src/trans/external/trans_end.F90 +++ b/src/trans/external/trans_end.F90 @@ -50,11 +50,8 @@ SUBROUTINE TRANS_END(CDMODE) USE TPM_DISTR ,ONLY : D, DISTR_RESOL, NPRCIDS USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL -USE TPM_FFT ,ONLY : T, FFT_RESOL, TB, FFTB_RESOL USE TPM_CTL ,ONLY : C, CTL_RESOL -#ifdef WITH_FFTW USE TPM_FFTW ,ONLY : TW, FFTW_RESOL -#endif USE TPM_FLT ,ONLY : S, FLT_RESOL USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN @@ -88,17 +85,9 @@ SUBROUTINE TRANS_END(CDMODE) NULLIFY(D) IF( ALLOCATED(DISTR_RESOL) ) DEALLOCATE(DISTR_RESOL) - !TPM_FFT - NULLIFY(T) - IF( ALLOCATED(FFT_RESOL) ) DEALLOCATE(FFT_RESOL) - NULLIFY(TB) - IF( ALLOCATED(FFTB_RESOL) ) DEALLOCATE(FFTB_RESOL) - -#ifdef WITH_FFTW !TPM_FFTW NULLIFY(TW) IF( ALLOCATED(FFTW_RESOL) ) DEALLOCATE(FFTW_RESOL) -#endif !TPM_FLT NULLIFY(S) diff --git a/src/trans/include/ectrans/setup_trans.h b/src/trans/include/ectrans/setup_trans.h index 0889e0d4..aa8dd833 100644 --- a/src/trans/include/ectrans/setup_trans.h +++ b/src/trans/include/ectrans/setup_trans.h @@ -11,7 +11,7 @@ INTERFACE SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& &KTMAX,KRESOL,PWEIGHT,LDGRIDONLY,LDUSERPNM,LDKEEPRPNM,LDUSEFLT,& -&LDSPSETUPONLY,LDPNMONLY,LDUSEFFTW,& +&LDSPSETUPONLY,LDPNMONLY,& &LDLL,LDSHIFTLL,CDIO_LEGPOL,CDLEGPOLFNAME,KLEGPOLPTR,KLEGPOLPTR_LEN) !**** *SETUP_TRANS* - Setup transform package for specific resolution @@ -51,7 +51,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! LDKEEPRPNM - Keep Legendre Polynomials (only applicable when using ! FLT, otherwise always kept) ! LDPNMONLY - Compute the Legendre polynomialsonly, not the FFTs. -! LDUSEFFTW - Use FFTW for FFTs ! LDLL - Setup second set of input/output latitudes ! the number of input/output latitudes to transform is equal KDGL ! or KDGL+2 in the case that includes poles + equator @@ -101,7 +100,6 @@ LOGICAL ,OPTIONAL,INTENT(IN):: LDUSERPNM LOGICAL ,OPTIONAL,INTENT(IN):: LDKEEPRPNM LOGICAL ,OPTIONAL,INTENT(IN):: LDPNMONLY LOGICAL ,OPTIONAL,INTENT(IN):: LDSPSETUPONLY -LOGICAL ,OPTIONAL,INTENT(IN):: LDUSEFFTW LOGICAL ,OPTIONAL,INTENT(IN):: LDLL LOGICAL ,OPTIONAL,INTENT(IN):: LDSHIFTLL CHARACTER(LEN=*),OPTIONAL,INTENT(IN):: CDIO_LEGPOL diff --git a/src/trans/internal/dealloc_resol_mod.F90 b/src/trans/internal/dealloc_resol_mod.F90 index 8f708f31..ac824b32 100644 --- a/src/trans/internal/dealloc_resol_mod.F90 +++ b/src/trans/internal/dealloc_resol_mod.F90 @@ -48,11 +48,7 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) USE TPM_DISTR ,ONLY : D,NPRTRV USE TPM_GEOMETRY ,ONLY : G USE TPM_FIELDS ,ONLY : F -USE TPM_FFT ,ONLY : T, TB -USE BLUESTEIN_MOD ,ONLY : BLUESTEIN_TERM -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW,DESTROY_PLANS_FFTW -#endif +USE TPM_FFTW ,ONLY : DESTROY_PLANS_FFTW USE TPM_FLT ,ONLY : S USE TPM_CTL ,ONLY : C USE SEEFMM_MIX ,ONLY : FREE_SEEFMM @@ -147,28 +143,8 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) IF(ALLOCATED(D%MSTABF)) DEALLOCATE(D%MSTABF) IF(ALLOCATED(D%NSTAGTF)) DEALLOCATE(D%NSTAGTF) - !TPM_FFT - IF (.NOT.D%LCPNMONLY) THEN - IF( ASSOCIATED(T) ) THEN - IF( ALLOCATED(T%TRIGS) ) DEALLOCATE(T%TRIGS) - IF( ALLOCATED(T%NFAX) ) DEALLOCATE(T%NFAX) - IF( ALLOCATED(T%LUSEFFT992)) DEALLOCATE(T%LUSEFFT992) - ENDIF - IF( ASSOCIATED(TB) ) THEN - IF( T%LBLUESTEIN )THEN - CALL BLUESTEIN_TERM(TB) - T%LBLUESTEIN = .FALSE. - ENDIF - ENDIF - ENDIF - -#ifdef WITH_FFTW !TPM_FFTW - IF( TW%LFFTW )THEN - CALL DESTROY_PLANS_FFTW - ENDIF -#endif - + CALL DESTROY_PLANS_FFTW !TPM_FIELDS IF(ALLOCATED(F%RMU)) DEALLOCATE(F%RMU) diff --git a/src/trans/internal/ftdir_mod.F90 b/src/trans/internal/ftdir_mod.F90 index 520aaae9..48ecc9a3 100644 --- a/src/trans/internal/ftdir_mod.F90 +++ b/src/trans/internal/ftdir_mod.F90 @@ -28,7 +28,7 @@ SUBROUTINE FTDIR(PREEL,KFIELDS,KGL) ! Method. ! ------- -! Externals. FFT992 - FFT routine +! Externals. FFTW - FFT routine ! ---------- ! @@ -50,11 +50,7 @@ SUBROUTINE FTDIR(PREEL,KFIELDS,KGL) USE TPM_DISTR ,ONLY : D, MYSETW USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFT ,ONLY : T, TB -USE BLUESTEIN_MOD ,ONLY : BLUESTEIN_FFT -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, EXEC_FFTW -#endif +USE TPM_FFTW ,ONLY : EXEC_FFTW USE TPM_DIM ,ONLY : R ! @@ -63,14 +59,13 @@ SUBROUTINE FTDIR(PREEL,KFIELDS,KGL) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL REAL(KIND=JPRB), POINTER, CONTIGUOUS, INTENT(INOUT) :: PREEL(:,:) -INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,IST1 +INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,JJ,IST1 INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN, ITYPE LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time ! ------------------------------------------------------------------ ITYPE=-1 -IJUMP= 1 IGLG = D%NPTRLS(MYSETW)+KGL-1 IST = 2*(G%NMEN(IGLG)+1)+1 ILEN = G%NLOEN(IGLG)+R%NNOEXTZL+3-IST @@ -80,34 +75,7 @@ SUBROUTINE FTDIR(PREEL,KFIELDS,KGL) IRLEN=G%NLOEN(IGLG)+R%NNOEXTZL ICLEN=(IRLEN/2+1)*2 -#ifdef WITH_FFTW - IF( .NOT. TW%LFFTW )THEN -#endif - - IF( T%LUSEFFT992(KGL) )THEN - - CALL FFT992(PREEL(:,IOFF:),T%TRIGS(1,KGL),& - &T%NFAX(1,KGL),KFIELDS,IJUMP,IRLEN,KFIELDS,ITYPE) - - ELSE - - CALL BLUESTEIN_FFT(TB,IRLEN,ITYPE,KFIELDS,PREEL(1:KFIELDS,IOFF:IOFF+ICLEN-1)) - DO JJ=1,ICLEN - DO JF=1,KFIELDS - PREEL(JF,IOFF+JJ-1)=PREEL(JF,IOFF+JJ-1)/REAL(IRLEN,JPRB) - ENDDO - ENDDO - - ENDIF - -#ifdef WITH_FFTW - ELSE - - CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) - - ENDIF -#endif - + CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) ENDIF IST1=1 diff --git a/src/trans/internal/ftdirad_mod.F90 b/src/trans/internal/ftdirad_mod.F90 index e3fef0f2..4d7471f0 100644 --- a/src/trans/internal/ftdirad_mod.F90 +++ b/src/trans/internal/ftdirad_mod.F90 @@ -28,7 +28,7 @@ SUBROUTINE FTDIRAD(PREEL,KFIELDS,KGL) ! Method. ! ------- -! Externals. FFT992 - FFT routine +! Externals. FFTW - FFT routine ! ---------- ! @@ -49,11 +49,7 @@ SUBROUTINE FTDIRAD(PREEL,KFIELDS,KGL) USE TPM_DISTR ,ONLY : D, MYSETW USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFT ,ONLY : T, TB -USE BLUESTEIN_MOD ,ONLY : BLUESTEIN_FFT -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, EXEC_FFTW -#endif +USE TPM_FFTW ,ONLY : EXEC_FFTW USE TPM_DIM ,ONLY : R IMPLICIT NONE @@ -61,14 +57,13 @@ SUBROUTINE FTDIRAD(PREEL,KFIELDS,KGL) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL REAL(KIND=JPRB), INTENT(INOUT) :: PREEL(:,:) -INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,ILOEN +INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,JJ,JF,ILOEN INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN,ITYPE REAL(KIND=JPRB) :: ZMUL LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time ! ------------------------------------------------------------------ ITYPE = 1 -IJUMP = 1 IGLG = D%NPTRLS(MYSETW)+KGL-1 IST = 2*(G%NMEN(IGLG)+1)+1 ILOEN = G%NLOEN(IGLG)+R%NNOEXTZL @@ -83,28 +78,7 @@ SUBROUTINE FTDIRAD(PREEL,KFIELDS,KGL) ENDDO ENDDO -#ifdef WITH_FFTW -IF( .NOT. TW%LFFTW )THEN -#endif - - IF( T%LUSEFFT992(KGL) )THEN - - CALL FFT992(PREEL(:,IOFF:),T%TRIGS(1,KGL),& - &T%NFAX(1,KGL),KFIELDS,IJUMP,ILOEN,KFIELDS,ITYPE) - - ELSE - - CALL BLUESTEIN_FFT(TB,ILOEN,ITYPE,KFIELDS,PREEL(1:KFIELDS,IOFF:IOFF+ICLEN-1)) - - ENDIF - -#ifdef WITH_FFTW -ELSE - - CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) - -ENDIF -#endif +CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) ! Change of metric (not in forward routine) diff --git a/src/trans/internal/ftinv_mod.F90 b/src/trans/internal/ftinv_mod.F90 index fec1714e..00c34107 100644 --- a/src/trans/internal/ftinv_mod.F90 +++ b/src/trans/internal/ftinv_mod.F90 @@ -27,7 +27,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS,KGL) ! Method. ! ------- -! Externals. FFT992 - FFT routine +! Externals. FFTW - FFT routine ! ---------- ! @@ -48,11 +48,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS,KGL) USE TPM_DISTR ,ONLY : D, MYSETW USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFT ,ONLY : T, TB -USE BLUESTEIN_MOD ,ONLY : BLUESTEIN_FFT -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, EXEC_FFTW -#endif +USE TPM_FFTW ,ONLY : EXEC_FFTW USE TPM_DIM ,ONLY : R IMPLICIT NONE @@ -60,14 +56,13 @@ SUBROUTINE FTINV(PREEL,KFIELDS,KGL) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL REAL(KIND=JPRB), INTENT(INOUT) :: PREEL(:,:) -INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,IST1 +INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,JJ,JF,IST1 INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN, ITYPE LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time ! ------------------------------------------------------------------ ITYPE = 1 -IJUMP = 1 IGLG = D%NPTRLS(MYSETW)+KGL-1 IST = 2*(G%NMEN(IGLG)+1)+1 ILEN = G%NLOEN(IGLG)+R%NNOEXTZL+3-IST @@ -85,29 +80,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS,KGL) IRLEN=G%NLOEN(IGLG)+R%NNOEXTZL ICLEN=(IRLEN/2+1)*2 -#ifdef WITH_FFTW - IF( .NOT. TW%LFFTW )THEN -#endif - - IF( T%LUSEFFT992(KGL) )THEN - - CALL FFT992(PREEL(:,IOFF:),T%TRIGS(1,KGL),& - &T%NFAX(1,KGL),KFIELDS,IJUMP,IRLEN,KFIELDS,ITYPE) - - ELSE - - CALL BLUESTEIN_FFT(TB,IRLEN,ITYPE,KFIELDS,PREEL(1:KFIELDS,IOFF:IOFF+ICLEN-1)) - - ENDIF - -#ifdef WITH_FFTW - ELSE - - CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) - - ENDIF -#endif - + CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) ENDIF ! ------------------------------------------------------------------ diff --git a/src/trans/internal/ftinvad_mod.F90 b/src/trans/internal/ftinvad_mod.F90 index 5ffac0e9..7e5c2dde 100644 --- a/src/trans/internal/ftinvad_mod.F90 +++ b/src/trans/internal/ftinvad_mod.F90 @@ -28,7 +28,7 @@ SUBROUTINE FTINVAD(PREEL,KFIELDS,KGL) ! Method. ! ------- -! Externals. FFT992 - FFT routine +! Externals. FFTW - FFT routine ! ---------- ! @@ -50,11 +50,7 @@ SUBROUTINE FTINVAD(PREEL,KFIELDS,KGL) USE TPM_DISTR ,ONLY : D, MYSETW USE TPM_DIM ,ONLY : R USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFT ,ONLY : T, TB -USE BLUESTEIN_MOD ,ONLY : BLUESTEIN_FFT -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, EXEC_FFTW -#endif +USE TPM_FFTW ,ONLY : EXEC_FFTW USE TPM_DIM ,ONLY : R ! @@ -63,14 +59,13 @@ SUBROUTINE FTINVAD(PREEL,KFIELDS,KGL) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL REAL(KIND=JPRB), INTENT(OUT) :: PREEL(:,:) -INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,ILOEN +INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,JJ,JF,ILOEN INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN,ITYPE LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time ! ------------------------------------------------------------------ ITYPE =-1 -IJUMP = 1 IGLG = D%NPTRLS(MYSETW)+KGL-1 ILOEN = G%NLOEN(IGLG)+R%NNOEXTZL IST = 2*(G%NMEN(IGLG)+1)+1 @@ -85,34 +80,8 @@ SUBROUTINE FTINVAD(PREEL,KFIELDS,KGL) PREEL(JF,IOFF-1+JJ) = PREEL(JF,IOFF-1+JJ)*ILOEN ENDDO ENDDO - -#ifdef WITH_FFTW -IF( .NOT. TW%LFFTW )THEN -#endif - IF( T%LUSEFFT992(KGL) )THEN - - CALL FFT992(PREEL(:,IOFF:),T%TRIGS(1,KGL),& - &T%NFAX(1,KGL),KFIELDS,IJUMP,ILOEN,KFIELDS,ITYPE) - - ELSE - - CALL BLUESTEIN_FFT(TB,ILOEN,ITYPE,KFIELDS,PREEL(1:KFIELDS,IOFF:IOFF+ICLEN-1)) - DO JJ=1,ICLEN - DO JF=1,KFIELDS - PREEL(JF,IOFF-1+JJ)=PREEL(JF,IOFF-1+JJ)/REAL(ILOEN,JPRB) - ENDDO - ENDDO - - ENDIF - -#ifdef WITH_FFTW -ELSE - - CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) - -ENDIF -#endif +CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) DO JJ=1,ILEN DO JF=1,KFIELDS diff --git a/src/trans/internal/set_resol_mod.F90 b/src/trans/internal/set_resol_mod.F90 index 2bf8f70d..315799b9 100644 --- a/src/trans/internal/set_resol_mod.F90 +++ b/src/trans/internal/set_resol_mod.F90 @@ -18,10 +18,7 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) USE TPM_DISTR ,ONLY : D, DISTR_RESOL USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL -USE TPM_FFT ,ONLY : T, FFT_RESOL, TB, FFTB_RESOL -#ifdef WITH_FFTW USE TPM_FFTW ,ONLY : TW, FFTW_RESOL -#endif USE TPM_FLT ,ONLY : S, FLT_RESOL USE TPM_CTL ,ONLY : C, CTL_RESOL USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS @@ -63,11 +60,7 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) F => FIELDS_RESOL(NCUR_RESOL) G => GEOM_RESOL(NCUR_RESOL) D => DISTR_RESOL(NCUR_RESOL) - T => FFT_RESOL(NCUR_RESOL) - TB => FFTB_RESOL(NCUR_RESOL) -#ifdef WITH_FFTW TW => FFTW_RESOL(NCUR_RESOL) -#endif S => FLT_RESOL(NCUR_RESOL) C => CTL_RESOL(NCUR_RESOL) ENDIF diff --git a/src/trans/internal/sufft_mod.F90 b/src/trans/internal/sufft_mod.F90 index dec0e912..cce02ec9 100644 --- a/src/trans/internal/sufft_mod.F90 +++ b/src/trans/internal/sufft_mod.F90 @@ -12,17 +12,12 @@ MODULE SUFFT_MOD CONTAINS SUBROUTINE SUFFT -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND1 ,ONLY : JPIM USE TPM_DIM ,ONLY : R USE TPM_GEN ,ONLY : NOUT, NPRINTLEV -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFT ,ONLY : T, TB -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, INIT_PLANS_FFTW -#endif -USE BLUESTEIN_MOD ,ONLY : BLUESTEIN_INIT, FFTB_TYPE +USE TPM_DISTR ,ONLY : D +USE TPM_FFTW ,ONLY : INIT_PLANS_FFTW ! IMPLICIT NONE @@ -38,64 +33,7 @@ SUBROUTINE SUFFT LLP2 = NPRINTLEV>1 IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SUFFT ===' -#ifdef WITH_FFTW - IF(TW%LFFTW)THEN - - CALL INIT_PLANS_FFTW(R%NDLON) - - ELSE - - NULLIFY(TW%FFTW_PLANS) -#endif - - ALLOCATE(T%TRIGS(R%NDLON,D%NDGL_FS)) - IF(LLP2)WRITE(NOUT,9) 'T%TRIGS ',SIZE(T%TRIGS),SHAPE(T%TRIGS) - ALLOCATE(T%NFAX(19,D%NDGL_FS)) - IF(LLP2)WRITE(NOUT,9) 'T%NFAX ',SIZE(T%NFAX),SHAPE(T%NFAX) - ALLOCATE(T%LUSEFFT992(D%NDGL_FS)) - IF(LLP2)WRITE(NOUT,9) 'T%LUSEFFT992',SIZE(T%LUSEFFT992),SHAPE(T%LUSEFFT992) - - ! - ! create TRIGS and NFAX for latitude lengths supported by FFT992, - ! that is just with factors 2, 3 or 5 - ! - - T%LBLUESTEIN=.FALSE. - ILATS=0 - DO JGL=1,D%NDGL_FS - IGLG = D%NPTRLS(MYSETW)+JGL-1 - CALL SET99B(T%TRIGS(1,JGL),T%NFAX(1,JGL),G%NLOEN(IGLG),T%LUSEFFT992(JGL)) - IF( .NOT.T%LUSEFFT992(JGL) )THEN - ILATS=ILATS+1 - T%LBLUESTEIN=.TRUE. - ENDIF - ENDDO - - ! - ! we only initialise for bluestein if there are latitude lengths - ! not supported by FFT992 - ! - - IF( T%LBLUESTEIN )THEN - TB%NDLON=R%NDLON - TB%NLAT_COUNT=ILATS - ILATS=0 - ALLOCATE(TB%NLATS(TB%NLAT_COUNT)) - DO JGL=1,D%NDGL_FS - IF( .NOT.T%LUSEFFT992(JGL) )THEN - ILATS=ILATS+1 - IGLG = D%NPTRLS(MYSETW)+JGL-1 - TB%NLATS(ILATS)=G%NLOEN(IGLG) - ENDIF - ENDDO - CALL BLUESTEIN_INIT(TB) - ENDIF - -#ifdef WITH_FFTW - - ENDIF -#endif - + CALL INIT_PLANS_FFTW(R%NDLON) ENDIF ! ------------------------------------------------------------------ diff --git a/src/trans/internal/tpm_fft.F90 b/src/trans/internal/tpm_fft.F90 deleted file mode 100644 index 5577262a..00000000 --- a/src/trans/internal/tpm_fft.F90 +++ /dev/null @@ -1,37 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2000- Meteo-France. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE TPM_FFT -USE PARKIND1 ,ONLY : JPIM ,JPRB -USE BLUESTEIN_MOD ,ONLY : FFTB_TYPE - -! Module for Fourier transforms. - -IMPLICIT NONE - -SAVE - -TYPE FFT_TYPE - REAL(KIND=JPRB) ,ALLOCATABLE :: TRIGS(:,:) ! list of trigonometric function values - INTEGER(KIND=JPIM),ALLOCATABLE :: NFAX(:,:) ! list of factors of truncation - LOGICAL :: LBLUESTEIN=.FALSE. ! logical indicating whether some - ! latitudes require bluestein algorithm - ! with prime factors that are not 2,3,or 5 - LOGICAL,ALLOCATABLE :: LUSEFFT992(:) ! describes which FFT algorithm to be used - ! T=use FFT992 F=use bluestein -END TYPE FFT_TYPE - -TYPE(FFT_TYPE),ALLOCATABLE,TARGET :: FFT_RESOL(:) -TYPE(FFT_TYPE),POINTER :: T - -TYPE(FFTB_TYPE),ALLOCATABLE,TARGET :: FFTB_RESOL(:) -TYPE(FFTB_TYPE),POINTER :: TB - -END MODULE TPM_FFT diff --git a/src/trans/internal/tpm_fftw.F90 b/src/trans/internal/tpm_fftw.F90 index fd81a3dd..f3faef0c 100644 --- a/src/trans/internal/tpm_fftw.F90 +++ b/src/trans/internal/tpm_fftw.F90 @@ -44,8 +44,7 @@ MODULE TPM_FFTW INTEGER(KIND=JPIM),ALLOCATABLE :: N_PLANS(:) TYPE(FFTW_PLAN),POINTER :: FFTW_PLANS(:) INTEGER(KIND=JPIM) :: N_MAX=0 ! maximum number of latitudes - INTEGER(KIND=JPIM) :: N_MAX_PLANS=4 ! maximum number of plans for each active latitude - LOGICAL :: LFFTW=.FALSE. + INTEGER(KIND=JPIM) :: N_MAX_PLANS=4 ! maximum number of plans for each active latitudes END TYPE FFTW_TYPE diff --git a/src/trans/sedrenames.txt b/src/trans/sedrenames.txt index 0f8e934b..63896567 100644 --- a/src/trans/sedrenames.txt +++ b/src/trans/sedrenames.txt @@ -1,5 +1,3 @@ -s/ FFT992_CC/ FFT992_CC_VARIANTDESIGNATOR/g -s/ FFT992( *($|\(| |\*))/ FFT992_VARIANTDESIGNATOR\1/g s/ASRE1_MOD/ASRE1_MOD_VARIANTDESIGNATOR/g s/ASRE1AD_MOD/ASRE1AD_MOD_VARIANTDESIGNATOR/g s/ASRE1B_MOD/ASRE1B_MOD_VARIANTDESIGNATOR/g @@ -115,7 +113,6 @@ s/SPNSDEAD_MOD/SPNSDEAD_MOD_VARIANTDESIGNATOR/g s/SUFFT_MOD/SUFFT_MOD_VARIANTDESIGNATOR/g s/SULEG_MOD/SULEG_MOD_VARIANTDESIGNATOR/g s/SUTRLE_MOD/SUTRLE_MOD_VARIANTDESIGNATOR/g -s/TPM_FFT( *(,|$| ))/TPM_FFT_VARIANTDESIGNATOR\1/g s/TPM_FFTW/TPM_FFTW_VARIANTDESIGNATOR/g s/TPM_FIELDS/TPM_FIELDS_VARIANTDESIGNATOR/g s/TPM_FLT/TPM_FLT_VARIANTDESIGNATOR/g diff --git a/src/transi/transi_module.F90 b/src/transi/transi_module.F90 index 35d3a567..d5e13dd0 100644 --- a/src/transi/transi_module.F90 +++ b/src/transi/transi_module.F90 @@ -129,8 +129,6 @@ module trans_module integer, private, parameter :: MAX_STR_LEN = 1024 -integer, parameter :: FFT992 = 1 -integer, parameter :: FFTW = 2 integer, parameter :: TRANS_SUCCESS = 0 integer, parameter :: TRANS_ERROR = -1 integer, parameter :: TRANS_NOTIMPL = -2 @@ -580,7 +578,7 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) logical, parameter :: lkeeprpnm =.False. logical, parameter :: luserpnm =.False. ! Don't use Belusov algorithm (uses twice the memory) logical :: ldlam ! output - logical :: lgridonly, lsplit, lusefftw !input + logical :: lgridonly, lsplit !input logical :: lspeconly ! only logical :: llatlon ! input logical :: llatlonshift ! input @@ -594,8 +592,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) lsplit = .False. if( trans%lsplit /= 0 ) lsplit = .True. - if( trans%fft == FFTW ) lusefftw = .True. - llatlon = .False. llatlonshift = .False. if( trans%llatlon /= 0 ) llatlon = .True. @@ -682,7 +678,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KDLON=nlon, & & CDIO_LEGPOL="readf", & & CDLEGPOLFNAME=readfp, & @@ -700,7 +695,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KDLON=nlon, & & CDIO_LEGPOL="readf", & & CDLEGPOLFNAME=readfp ) @@ -719,7 +713,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KDLON=nlon, & & CDIO_LEGPOL="writef", & & CDLEGPOLFNAME=writefp, & @@ -737,7 +730,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KDLON=nlon, & & CDIO_LEGPOL="writef", & & CDLEGPOLFNAME=writefp ) @@ -756,7 +748,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KDLON=nlon, & & CDIO_LEGPOL="membuf", & & KLEGPOLPTR=trans%cache, & @@ -775,7 +766,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KDLON=nlon, & & CDIO_LEGPOL="membuf", & & KLEGPOLPTR=trans%cache, & @@ -797,7 +787,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KDLON=nlon, & & LDUSEFLT=luseflt ) @@ -813,7 +802,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KDLON=nlon ) endif @@ -838,7 +826,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KLOEN=nloen, CDIO_LEGPOL="readf", & & CDLEGPOLFNAME=trim(readfp),& & LDUSEFLT=luseflt ) @@ -855,7 +842,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KLOEN=nloen, CDIO_LEGPOL="readf", & & CDLEGPOLFNAME=trim(readfp) ) @@ -874,7 +860,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KLOEN=nloen, & & CDIO_LEGPOL="writef", & & CDLEGPOLFNAME=trim(writefp), & @@ -892,7 +877,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KLOEN=nloen, & & CDIO_LEGPOL="writef", & & CDLEGPOLFNAME=trim(writefp) ) @@ -913,7 +897,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KLOEN=nloen, & & CDIO_LEGPOL="membuf", & & KLEGPOLPTR=trans%cache, & @@ -932,7 +915,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KLOEN=nloen, & & CDIO_LEGPOL="membuf", & & KLEGPOLPTR=trans%cache, & @@ -953,7 +935,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KLOEN=nloen, & & LDUSEFLT=luseflt ) @@ -969,7 +950,6 @@ function trans_setup(trans) bind(C,name="trans_setup") result(iret) & LDSPLIT=LSPLIT, & & LDKEEPRPNM=LKEEPRPNM, & & LDUSERPNM=LUSERPNM, & - & LDUSEFFTW=LUSEFFTW, & & KLOEN=nloen ) endif endif From e6d15511d8272df065ed4ac50f1bf346613f6d2a Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 4 Jun 2024 10:05:48 +0000 Subject: [PATCH 25/48] Absorb SUFFT into SETUP_TRANS This subroutine is a bit pointless if you only have FFTW. --- src/trans/external/setup_trans.F90 | 7 +++-- src/trans/internal/sufft_mod.F90 | 44 ------------------------------ 2 files changed, 4 insertions(+), 47 deletions(-) delete mode 100644 src/trans/internal/sufft_mod.F90 diff --git a/src/trans/external/setup_trans.F90 b/src/trans/external/setup_trans.F90 index ce1dd482..7ebdace8 100644 --- a/src/trans/external/setup_trans.F90 +++ b/src/trans/external/setup_trans.F90 @@ -108,7 +108,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_DISTR ,ONLY : D, DISTR_RESOL,NPROC USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL USE TPM_FIELDS ,ONLY : FIELDS_RESOL -USE TPM_FFTW ,ONLY : TW, FFTW_RESOL +USE TPM_FFTW ,ONLY : TW, FFTW_RESOL, INIT_PLANS_FFTW USE TPM_FLT ,ONLY : S, FLT_RESOL USE TPM_CTL ,ONLY : C, CTL_RESOL @@ -118,7 +118,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE SUMP_TRANS_PRELEG_MOD ,ONLY : SUMP_TRANS_PRELEG USE SULEG_MOD ,ONLY : SULEG USE PRE_SULEG_MOD ,ONLY : PRE_SULEG -USE SUFFT_MOD ,ONLY : SUFFT USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE SHAREDMEM_MOD ,ONLY : SHAREDMEM_CREATE USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK @@ -389,7 +388,9 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& CALL GSTATS(1802,0) ! Initialize Fast Fourier Transform package - IF (.NOT.D%LCPNMONLY) CALL SUFFT + IF (.NOT. D%LCPNMONLY .AND. .NOT. D%LGRIDONLY) THEN + CALL INIT_PLANS_FFTW(R%NDLON) + ENDIF CALL GSTATS(1802,1) ELSE CALL PRE_SULEG diff --git a/src/trans/internal/sufft_mod.F90 b/src/trans/internal/sufft_mod.F90 deleted file mode 100644 index cce02ec9..00000000 --- a/src/trans/internal/sufft_mod.F90 +++ /dev/null @@ -1,44 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2000- Meteo-France. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE SUFFT_MOD -CONTAINS -SUBROUTINE SUFFT - -USE PARKIND1 ,ONLY : JPIM - -USE TPM_DIM ,ONLY : R -USE TPM_GEN ,ONLY : NOUT, NPRINTLEV -USE TPM_DISTR ,ONLY : D -USE TPM_FFTW ,ONLY : INIT_PLANS_FFTW -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM) :: JGL,IGLG, ILATS -LOGICAL :: LLP1,LLP2 - -! ------------------------------------------------------------------ - -IF(.NOT.D%LGRIDONLY) THEN - - LLP1 = NPRINTLEV>0 - LLP2 = NPRINTLEV>1 - IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SUFFT ===' - - CALL INIT_PLANS_FFTW(R%NDLON) -ENDIF - -! ------------------------------------------------------------------ - -9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) - -END SUBROUTINE SUFFT -END MODULE SUFFT_MOD From 173121138acb1387b1f9d0aef679823e362259fc Mon Sep 17 00:00:00 2001 From: DJDavies2 Date: Thu, 6 Jun 2024 17:37:16 +0100 Subject: [PATCH 26/48] Updates following comments. --- src/programs/ectrans-benchmark.F90 | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index 730d9c69..ba951575 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -710,7 +710,9 @@ program transform_test call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc(1:1)) call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset(1:nflevg)) call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset(1:nflevg)) - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg)) + if (nfld > 0) then + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg)) + endif ! Surface pressure if (myproc == 1) then @@ -758,7 +760,9 @@ program transform_test if (lprint_norms .or. ncheck > 0) then call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset) call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset) - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset) + if (nfld > 0) then + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset) + endif call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc) if (myproc == 1) then From 3da552cea83d8f5a302039769abb5883004e8dfa Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 24 Jun 2024 16:38:20 +0000 Subject: [PATCH 27/48] Remove fft992 and Bluestein algor files --- src/trans/algor/bluestein_mod.F90 | 386 ----- src/trans/algor/fft992.F90 | 2377 ----------------------------- src/trans/algor/fft992_cc.F90 | 139 -- src/trans/algor/set99.F90 | 82 - src/trans/algor/set99b.F90 | 81 - src/trans/sedrenames.txt | 3 - 6 files changed, 3068 deletions(-) delete mode 100644 src/trans/algor/bluestein_mod.F90 delete mode 100644 src/trans/algor/fft992.F90 delete mode 100644 src/trans/algor/fft992_cc.F90 delete mode 100644 src/trans/algor/set99.F90 delete mode 100644 src/trans/algor/set99b.F90 diff --git a/src/trans/algor/bluestein_mod.F90 b/src/trans/algor/bluestein_mod.F90 deleted file mode 100644 index 4d282170..00000000 --- a/src/trans/algor/bluestein_mod.F90 +++ /dev/null @@ -1,386 +0,0 @@ -! (C) Copyright 2015- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE BLUESTEIN_MOD - -! Implementation of the Bluestein FFT algorithm as described in a paper titled -! "Bluestein's FFT for Arbitrary N on the Hypercube", Paul N. Swarztrauber et al., -! Parallel Computing, 17 (1991), pp. 607-617. -! -! George Mozdzynski and Nils Wedi, June 2015 -! -! The naming convention follows the algorithm description in the above paper. -! -USE PARKIND1, ONLY : JPIM, JPRB - -IMPLICIT NONE - -PRIVATE -PUBLIC BLUESTEIN_FFT, BLUESTEIN_INIT, BLUESTEIN_TERM, FFTB_TYPE - -TYPE FFTB_PLAN - INTEGER(KIND=JPIM) :: NSIZE ! latitude length security check - REAL(KIND=JPRB),ALLOCATABLE :: HS(:,:,:) - REAL(KIND=JPRB),ALLOCATABLE :: H2xT(:,:,:) -END TYPE FFTB_PLAN - -TYPE FFTB_TYPE - INTEGER(KIND=JPIM) :: NDLON ! maximum number of points on a latitude - REAL(KIND=JPRB) ,ALLOCATABLE :: TRIGS(:,:) ! list of trigonometric function values (PO2) - INTEGER(KIND=JPIM),ALLOCATABLE :: NFAX(:,:) ! list of factors of truncation (PO2) - INTEGER(KIND=JPIM) :: NLAT_COUNT ! number of lats requiring bluestein FFT - INTEGER(KIND=JPIM),ALLOCATABLE :: NLATS(:) ! the latitude lengths of these latitudes - TYPE(FFTB_PLAN),ALLOCATABLE :: FFTB(:) -END TYPE FFTB_TYPE - -CONTAINS -!----------------------------------------------------------------------------- -SUBROUTINE BLUESTEIN_FFT(TB,N,KSIGN,KLOT,PDAT) -! N : FFT LENGTH -! KSIGN : FFT DIRECTION -! -1 DIRECT (R2C) -! 1 INVERSE (C2R) -IMPLICIT NONE -TYPE(FFTB_TYPE),INTENT(INOUT) :: TB -INTEGER,INTENT(IN) :: N,KSIGN,KLOT -REAL(KIND=JPRB),INTENT(INOUT) :: PDAT (:,:) -REAL(KIND=JPRB),ALLOCATABLE :: ZDATAR(:,:), ZDATAI(:,:),ZY(:,:) -REAL(KIND=JPRB) :: ZR(KLOT),ZI(KLOT),ZX0(KLOT) -REAL(KIND=JPRB) :: ZWR,ZWI -INTEGER(KIND=JPIM) :: I,K,M,JLOT,NN,II,IR,IPO2 -INTEGER(KIND=JPIM) :: IJUMP,ILOT,IINC,ISIGN,IFFTSIGN - -!WRITE(*,'("BLUESTEIN_FFT: N=",I6," KSIGN=",I2," KLOT=",I4)')& -! & N,KSIGN,KLOT - -IF( KSIGN/=-1 .AND. KSIGN/=1 )THEN - CALL ABOR1('BLUESTEIN_FFT: INVALID KSIGN') -ENDIF - -NN=N/2+1 - -IF( TB%FFTB(N)%NSIZE /= N )THEN - WRITE(0,'("BLUESTEIN_FFT: UNEXPECTED PLAN LATITUDE LENGTH, N=",I6," TB%FFTB(N)%NSIZE=",I6)')& - & N,TB%FFTB(N)%NSIZE - CALL ABOR1('BLUESTEIN_FFT: UNEXPECTED PLAN LATITUDE LENGTH') -ENDIF - -IF( KSIGN==-1 )THEN - ISIGN=1 -ELSE - ISIGN=2 -ENDIF - -! input data preparation - -ALLOCATE(ZDATAR(KLOT,0:2*NN-1)) -ALLOCATE(ZDATAI(KLOT,0:2*NN-1)) -ZDATAR(:,:)=0.0D0 -ZDATAI(:,:)=0.0D0 -IF( KSIGN==-1 )THEN - DO K=0,N-1 - DO JLOT=1,KLOT - ZDATAR(JLOT,K)=PDAT(JLOT,K+1) - ENDDO - ENDDO -ELSEIF( KSIGN==1 )THEN - DO JLOT=1,KLOT - DO K=0,NN-1 - ZDATAR(JLOT,K)=PDAT(JLOT,K*2+1) - ZDATAR(JLOT,N-K)=PDAT(JLOT,K*2+1) - ZDATAI(JLOT,K)=PDAT(JLOT,K*2+2) - ZDATAI(JLOT,N-K) = -PDAT(JLOT,K*2+2) - ENDDO - ZDATAI(JLOT,0)=0._JPRB - ENDDO - -ENDIF - -! -! Compute M as the smallest power of two that is greater than or equal to 2N-2 -! and compute the M vector H2 from equations (2.16) -! - -M=1 -IPO2=0 -DO WHILE( M<=2*N-2) - M=M*2 - IPO2=IPO2+1 -ENDDO - -ALLOCATE(ZY(2*KLOT,0:(M/2+1)*2)) - -! create Y by mult with bluestein n**2 - -ZX0(1:KLOT) = ZDATAR(1:KLOT,0) -DO I=0,N-1 - - ZR=ZDATAR(1:KLOT,I) - ZI=ZDATAI(1:KLOT,I) - - ZWR=TB%FFTB(N)%HS(1,I,ISIGN) - ZWI=TB%FFTB(N)%HS(2,I,ISIGN) - - DO K=1,KLOT - ZY((K-1)*2+1,I) = ZR(K)*ZWR + ZI(K)*ZWI - ZY((K-1)*2+2,I) = ZI(K)*ZWR - ZR(K)*ZWI - ENDDO - -ENDDO - -! zero padding of Y - -DO I=N,(M/2+1)*2 - ZY(:,I) = 0._JPRB -ENDDO - -! FFT of Y - -ILOT=2*KLOT -IINC=ILOT -IJUMP=1 -IFFTSIGN=-1 ! R->C - -CALL FFT992(ZY,TB%TRIGS(1,IPO2),TB%NFAX(1,IPO2),IINC,IJUMP,M,ILOT,IFFTSIGN) -CALL FFT992_CC(ZY, IINC, IJUMP, M, ILOT, IFFTSIGN) - -! convert real FFT output, pointwise multiplication with h_hat(n-k) and real/imag -! swap in preparation for inverse FFT - -DO I=0,M-1 - DO K=1,KLOT - ZR(K)=ZY((K-1)*2+1,I) - ZI(K)=ZY((K-1)*2+2,I) - ENDDO - - ZWR=TB%FFTB(N)%H2xT(1,I,ISIGN) - ZWI=TB%FFTB(N)%H2xT(2,I,ISIGN) - -! swap - DO K=1,KLOT - ZY((K-1)*2+1,I) = ZI(K)*ZWR + ZR(K)*ZWI - ZY((K-1)*2+2,I) = ZR(K)*ZWR - ZI(K)*ZWI - ENDDO -ENDDO - -! IFFT as a FFT with swapped input and swapped output - -CALL FFT992(ZY,TB%TRIGS(1,IPO2),TB%NFAX(1,IPO2),IINC,IJUMP,M,ILOT,IFFTSIGN) -CALL FFT992_CC (ZY, IINC, IJUMP, M, ILOT, IFFTSIGN) - -! create final by mult with another bluestein n**2 and swap output of prev FFT -! postprocessing - -IF( KSIGN==-1) THEN - - DO I=0,N/2 - DO K=1,KLOT - ZI(K)=ZY((K-1)*2+1,I) - ZR(K)=ZY((K-1)*2+2,I) - ENDDO - - ZWR=TB%FFTB(N)%HS(1,I,ISIGN) - ZWI=TB%FFTB(N)%HS(2,I,ISIGN) - IR=I*2+1 - II=I*2+2 - DO K=1,KLOT - PDAT(K,IR) = ZR(K)*ZWR + ZI(K)*ZWI - PDAT(K,II) = ZI(K)*ZWR - ZR(K)*ZWI - ENDDO - ENDDO - -ELSE - - DO I=0,N-1 - - DO K=1,KLOT - ZI(K)=ZY((K-1)*2+1,I) - ZR(K)=ZY((K-1)*2+2,I) - ENDDO - - ZWR=TB%FFTB(N)%HS(1,I,ISIGN) - ZWI=TB%FFTB(N)%HS(2,I,ISIGN) - - DO K=1,KLOT - PDAT(K,I+1) = ZR(K)*ZWR + ZI(K)*ZWI - ENDDO - ENDDO - DO K=1,KLOT - PDAT(K,N) =PDAT(K,N) + ZX0(K) - ENDDO - -ENDIF - -DEALLOCATE(ZY) -DEALLOCATE(ZDATAR) -DEALLOCATE(ZDATAI) - -RETURN -END SUBROUTINE BLUESTEIN_FFT - - -!============================================================================= -SUBROUTINE BLUESTEIN_INIT(TB) -! -! Initialize data structures required by Bluestein FFT -! -! -TYPE(FFTB_TYPE),INTENT(INOUT) :: TB -INTEGER(KIND=JPIM) :: N,M,IPO2,JLAT,J,K,ISIGN,KSIGN -INTEGER(KIND=JPIM) :: IJUMP,ILOT,IINC,IFFTSIGN - -LOGICAL :: LLUSEFFT992 -REAL(KIND=JPRB) :: DEL,ANGLE,ZSIGN - -! determine number of PO2 FFT sizes needed by Bluestein FFTs -M=1 -IPO2=0 -DO WHILE( M<=2*TB%NDLON-2) - M=M*2 - IPO2=IPO2+1 -ENDDO - -!WRITE(*,'("BLUESTEIN_INIT: 2*KLON-2=",I5," M=",I5," IPO2=",I2)')2*TB%NDLON-2,M,IPO2 - -! now go and generate the trigs for the above number of PO2 FFT sizes -ALLOCATE(TB%TRIGS(M,IPO2)) -ALLOCATE(TB%NFAX(19,IPO2)) -TB%TRIGS(:,:)=0.0D0 -TB%NFAX (:,:)=0.0D0 - -M=1 -IPO2=0 -DO WHILE( M<=2*TB%NDLON-2) - M=M*2 - IPO2=IPO2+1 - CALL SET99B(TB%TRIGS(1,IPO2),TB%NFAX(1,IPO2),M,LLUSEFFT992) - IF( .NOT.LLUSEFFT992 )THEN - CALL ABOR1("BLUESTEIN_INIT: UNEXPECTED LLUSEFFT992=F") - ENDIF -ENDDO - -ALLOCATE(TB%FFTB(TB%NDLON)) -DO J=1,TB%NDLON - TB%FFTB(J)%NSIZE=-1 -ENDDO - -DO JLAT=1,TB%NLAT_COUNT - - N=TB%NLATS(JLAT) - - IF( TB%FFTB(N)%NSIZE==N )THEN - ! we have already initialised this latitude length - ! WRITE(0,'("BLUESTEIN_INIT: WARNING - LATITUDE LENGTH ",I6," ALREADY INITIALIZED")')N - CYCLE - ENDIF - - IF( N > TB%NDLON )THEN - CALL ABOR1("BLUESTEIN_INIT: N > TB%NDLON UNEXPECTED") - ENDIF - - ! now set the specific PO2 (i.e. M and IPO2) for the N length of - ! this latitude being initialized - M=1 - IPO2=0 - DO WHILE( M<=2*N-2) - M=M*2 - IPO2=IPO2+1 - ENDDO - - TB%FFTB(N)%NSIZE=N - - - DEL=2.0D0*ASIN(1.0D0)/REAL(N,JPRB) - - ALLOCATE(TB%FFTB(N)%HS(2,0:N-1,2)) - ALLOCATE(TB%FFTB(N)%H2xT(2,0:(M/2+1)*2,2)) - - DO ISIGN=1,2 - - IF( ISIGN==1 )THEN - KSIGN=-1 - ELSE - KSIGN= 1 - ENDIF - - ZSIGN=-KSIGN - - ! conjugate bluestein sequence - - DO K=0,N-1 - ANGLE=REAL(K*K,JPRB)*DEL - TB%FFTB(N)%HS(1,K,ISIGN)=COS(ANGLE) - TB%FFTB(N)%HS(2,K,ISIGN)=ZSIGN*SIN(ANGLE) - ENDDO - - DO K=0,(M/2+1)*2 - TB%FFTB(N)%H2xT(1,K,ISIGN) = 0._JPRB - TB%FFTB(N)%H2xT(2,K,ISIGN) = 0._JPRB - ENDDO - TB%FFTB(N)%H2xT(1,0,ISIGN) = TB%FFTB(N)%HS(1,0,ISIGN) - TB%FFTB(N)%H2xT(2,0,ISIGN) = TB%FFTB(N)%HS(2,0,ISIGN) - - DO K=1,N-1 - TB%FFTB(N)%H2xT(1,K,ISIGN) = TB%FFTB(N)%HS(1,K,ISIGN) - TB%FFTB(N)%H2xT(1,M-K,ISIGN) = TB%FFTB(N)%HS(1,K,ISIGN) - TB%FFTB(N)%H2xT(2,K,ISIGN) = TB%FFTB(N)%HS(2,K,ISIGN) - TB%FFTB(N)%H2xT(2,M-K,ISIGN) = TB%FFTB(N)%HS(2,K,ISIGN) - ENDDO - IF( M > 2*N-2 ) THEN - DO K=N,M-N+1 - TB%FFTB(N)%H2xT(1,K,ISIGN) = 0._JPRB - TB%FFTB(N)%H2xT(2,K,ISIGN) = 0._JPRB - ENDDO - ENDIF - - - ! - ! Compute an unnormalized discrete Fourier transform of H2 -> H_hat - ! - ILOT=2 - IINC=ILOT - IJUMP=1 - IFFTSIGN=1 ! C->R - CALL FFT992_CC(TB%FFTB(N)%H2xT(:,:,ISIGN),IINC,IJUMP,M,ILOT,IFFTSIGN) - CALL FFT992(TB%FFTB(N)%H2xT(:,:,ISIGN),TB%TRIGS(1,IPO2),TB%NFAX(1,IPO2),IINC,IJUMP,M,ILOT,IFFTSIGN) - - ENDDO ! ISIGN - -ENDDO ! JLAT - -RETURN -END SUBROUTINE BLUESTEIN_INIT - - -!============================================================================= -SUBROUTINE BLUESTEIN_TERM(TB) -! -! Remove data structures used by Bluestein FFT -! -! -TYPE(FFTB_TYPE),INTENT(INOUT) :: TB -INTEGER(KIND=JPIM) :: N,JLAT - -IF( ALLOCATED(TB%TRIGS) ) DEALLOCATE(TB%TRIGS) -IF( ALLOCATED(TB%NFAX) ) DEALLOCATE(TB%NFAX) -DO JLAT=1,TB%NLAT_COUNT - N=TB%NLATS(JLAT) - IF( ALLOCATED(TB%FFTB(N)%HS) ) DEALLOCATE(TB%FFTB(N)%HS) - IF( ALLOCATED(TB%FFTB(N)%H2xT) ) DEALLOCATE(TB%FFTB(N)%H2xT) -ENDDO -IF( ALLOCATED(TB%NLATS) ) DEALLOCATE(TB%NLATS) -IF( ALLOCATED(TB%FFTB) ) DEALLOCATE(TB%FFTB) - -RETURN -END SUBROUTINE BLUESTEIN_TERM - - -!============================================================================= - -END MODULE BLUESTEIN_MOD diff --git a/src/trans/algor/fft992.F90 b/src/trans/algor/fft992.F90 deleted file mode 100644 index 04ea69e4..00000000 --- a/src/trans/algor/fft992.F90 +++ /dev/null @@ -1,2377 +0,0 @@ -! (C) Copyright 1998- ECMWF. -! (C) Copyright 2013- Meteo-France. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -! -! SUBROUTINE 'FFT992' - MULTIPLE FAST REAL PERIODIC TRANSFORM -! -! Author: Clive Temperton, January 1998 -! -! This routine is a modernized and enhanced version of FFT991 -! - Cray directives and ancient Fortran constructs removed -! - "vector chopping" removed -! - WORK array is now dynamically allocated -! - stride in WORK array is now always 1 -! -! REAL TRANSFORM OF LENGTH N PERFORMED BY REMOVING REDUNDANT -! OPERATIONS FROM COMPLEX TRANSFORM OF LENGTH N -! -! A IS THE ARRAY CONTAINING INPUT & OUTPUT DATA -! TRIGS IS A PREVIOUSLY PREPARED LIST OF TRIG FUNCTION VALUES -! IFAX IS A PREVIOUSLY PREPARED LIST OF FACTORS OF N -! INC IS THE INCREMENT WITHIN EACH DATA 'VECTOR' -! (E.G. INC=1 FOR CONSECUTIVELY STORED DATA) -! JUMP IS THE INCREMENT BETWEEN THE START OF EACH DATA VECTOR -! N IS THE LENGTH OF THE DATA VECTORS -! LOT IS THE NUMBER OF DATA VECTORS -! ISIGN = +1 FOR TRANSFORM FROM SPECTRAL TO GRIDPOINT -! = -1 FOR TRANSFORM FROM GRIDPOINT TO SPECTRAL -! -! ORDERING OF COEFFICIENTS: -! A(0),B(0),A(1),B(1),A(2),B(2),...,A(N/2),B(N/2) -! WHERE B(0)=B(N/2)=0; (N+2) LOCATIONS REQUIRED -! -! ORDERING OF DATA: -! X(0),X(1),X(2),...,X(N-1), 0 , 0 ; (N+2) LOCATIONS REQUIRED -! -! VECTORIZATION IS ACHIEVED BY DOING THE TRANSFORMS IN PARALLEL -! -! N MUST BE COMPOSED OF FACTORS 2,3 & 5 BUT DOES NOT HAVE TO BE EVEN -! -! DEFINITION OF TRANSFORMS: -! ------------------------- -! -! ISIGN=+1: X(J)=SUM(K=0,...,N-1)(C(K)*EXP(2*I*J*K*PI/N)) -! WHERE C(K)=A(K)+I*B(K) AND C(N-K)=A(K)-I*B(K) -! -! ISIGN=-1: A(K)=(1/N)*SUM(J=0,...,N-1)(X(J)*COS(2*J*K*PI/N)) -! B(K)=-(1/N)*SUM(J=0,...,N-1)(X(J)*SIN(2*J*K*PI/N)) -! -#ifdef MATHKEISAN -! MathKeisan is a scientific library optimized for NEC (www.mathkeisan.com) - - SUBROUTINE FFT992(A,TRIGS_,IFAX_,INC,JUMP,N,LOT,ISIGN) -!AUTOPROMOTE - USE PARKIND1, ONLY : JPIM, JPRB - USE YOMHOOK , ONLY : LHOOK, DR_HOOK, JPHOOK - IMPLICIT NONE - INTEGER(KIND=JPIM) :: N - REAL(KIND=JPRB) :: A(*) - REAL(KIND=JPRB) :: TRIGS_(N) - INTEGER(KIND=JPIM) :: IFAX_(10) - - INTEGER(KIND=JPIM) :: INC - INTEGER(KIND=JPIM) :: JUMP - INTEGER(KIND=JPIM) :: LOT - INTEGER(KIND=JPIM) :: ISIGN - - REAL(KIND=JPRB),ALLOCATABLE,DIMENSION(:),SAVE :: WORK , TRIGS - INTEGER(KIND=JPIM),SAVE :: IFAX (32) - - - INTEGER(KIND=JPIM), SAVE :: N_OLD=-1 - INTEGER(KIND=JPIM), SAVE :: LOT_OLD=-1 - -!$OMP threadprivate(ifax,n_old,lot_old,trigs,work) - - - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - IF (LHOOK) CALL DR_HOOK('FFT992',0,ZHOOK_HANDLE) - IF (N .NE. N_OLD) THEN - - IF( ALLOCATED( WORK ) ) DEALLOCATE( WORK ) - IF( ALLOCATED( TRIGS ) ) DEALLOCATE( TRIGS ) - - ALLOCATE(WORK(3*N*LOT)) - ALLOCATE(TRIGS(2*N)) - - CALL DFTFAX ( N, IFAX, TRIGS ) - - N_OLD = N - LOT_OLD = LOT - - ELSE - - IF (LOT .GT. LOT_OLD) THEN - - IF( ALLOCATED( WORK ) ) DEALLOCATE( WORK ) - ALLOCATE(WORK(3*N*LOT)) - LOT_OLD = LOT - - ENDIF - - ENDIF - - CALL DFFTMLT ( A, WORK, TRIGS, IFAX, INC, JUMP, N, LOT, ISIGN ) - - IF (LHOOK) CALL DR_HOOK('FFT992',1,ZHOOK_HANDLE) - RETURN - - END SUBROUTINE FFT992 -#else -! -! SUBROUTINE 'FFT992' - MULTIPLE FAST REAL PERIODIC TRANSFORM -! -! Author: Clive Temperton, January 1998 -! -! This routine is a modernized and enhanced version of FFT991 -! - Cray directives and ancient Fortran constructs removed -! - "vector chopping" removed -! - WORK array is now dynamically allocated -! - stride in WORK array is now always 1 -! -! REAL TRANSFORM OF LENGTH N PERFORMED BY REMOVING REDUNDANT -! OPERATIONS FROM COMPLEX TRANSFORM OF LENGTH N -! -! A IS THE ARRAY CONTAINING INPUT & OUTPUT DATA -! TRIGS IS A PREVIOUSLY PREPARED LIST OF TRIG FUNCTION VALUES -! IFAX IS A PREVIOUSLY PREPARED LIST OF FACTORS OF N -! INC IS THE INCREMENT WITHIN EACH DATA 'VECTOR' -! (E.G. INC=1 FOR CONSECUTIVELY STORED DATA) -! JUMP IS THE INCREMENT BETWEEN THE START OF EACH DATA VECTOR -! N IS THE LENGTH OF THE DATA VECTORS -! LOT IS THE NUMBER OF DATA VECTORS -! ISIGN = +1 FOR TRANSFORM FROM SPECTRAL TO GRIDPOINT -! = -1 FOR TRANSFORM FROM GRIDPOINT TO SPECTRAL -! -! ORDERING OF COEFFICIENTS: -! A(0),B(0),A(1),B(1),A(2),B(2),...,A(N/2),B(N/2) -! WHERE B(0)=B(N/2)=0; (N+2) LOCATIONS REQUIRED -! -! ORDERING OF DATA: -! X(0),X(1),X(2),...,X(N-1), 0 , 0 ; (N+2) LOCATIONS REQUIRED -! -! VECTORIZATION IS ACHIEVED BY DOING THE TRANSFORMS IN PARALLEL -! -! N MUST BE COMPOSED OF FACTORS 2,3 & 5 BUT DOES NOT HAVE TO BE EVEN -! -! DEFINITION OF TRANSFORMS: -! ------------------------- -! -! ISIGN=+1: X(J)=SUM(K=0,...,N-1)(C(K)*EXP(2*I*J*K*PI/N)) -! WHERE C(K)=A(K)+I*B(K) AND C(N-K)=A(K)-I*B(K) -! -! ISIGN=-1: A(K)=(1/N)*SUM(J=0,...,N-1)(X(J)*COS(2*J*K*PI/N)) -! B(K)=-(1/N)*SUM(J=0,...,N-1)(X(J)*SIN(2*J*K*PI/N)) -! - SUBROUTINE FFT992(A,TRIGS,IFAX,INC,JUMP,N,LOT,ISIGN) -!disabled for now. REK.!DEC$ OPTIMIZE:3 -!AUTOPROMOTE - USE PARKIND1, ONLY : JPIM, JPRB - USE YOMHOOK , ONLY : LHOOK, DR_HOOK, JPHOOK - IMPLICIT NONE -! - INTEGER(KIND=JPIM) :: N - INTEGER(KIND=JPIM) :: INC - INTEGER(KIND=JPIM) :: JBASE - INTEGER(KIND=JPIM) :: JUMP - INTEGER(KIND=JPIM) :: J,JJ,JUMPA - INTEGER(KIND=JPIM) :: LOT - INTEGER(KIND=JPIM) :: K,LA,NFAX - INTEGER(KIND=JPIM) :: ISIGN - INTEGER(KIND=JPIM) :: I,IA,IBASE,IERR,IFAC,IGO,II,INCA,IX - - REAL(KIND=JPRB) :: A(*) - REAL(KIND=JPRB) :: TRIGS(N) - INTEGER(KIND=JPIM) :: IFAX(10) -! Dynamically allocated work array: - REAL(KIND=JPRB) :: WORK(N*LOT+1) - LOGICAL :: LIPL -! - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - IF (LHOOK) CALL DR_HOOK('FFT992',0,ZHOOK_HANDLE) - NFAX=IFAX(1) - IF (ISIGN.EQ.+1) THEN -! -! ISIGN=+1, SPECTRAL TO GRIDPOINT TRANSFORM -! ----------------------------------------- -! - I=1 -!OCL NOVREC -!DEC$ IVDEP - DO J=1,LOT - A(I+INC)=0.5_JPRB*A(I) - I=I+JUMP - ENDDO - IF (MOD(N,2).EQ.0) THEN - I=N*INC+1 -!OCL NOVREC -!DEC$ IVDEP - DO J=1,LOT - A(I)=0.5_JPRB*A(I) - I=I+JUMP - ENDDO - ENDIF -! - IA=INC+1 - LA=1 - IGO=+1 -! - DO K=1,NFAX - IFAC=IFAX(K+1) - IERR=-1 - IF (K.EQ.NFAX.AND.NFAX.GT.2.AND.IGO.EQ.+1) THEN - LIPL=.TRUE. - ELSE - LIPL=.FALSE. - ENDIF - IF (INC.EQ.1.AND.JUMP.LT.(2*N).AND. & - & K.GT.1.AND.K.LT.(NFAX-MOD(NFAX,2))) THEN - INCA=LOT - JUMPA=1 - ELSE - INCA=INC - JUMPA=JUMP - ENDIF - IF (IGO.EQ.+1) THEN -!DEC$ FORCEINLINE - CALL RPASSF(A(IA),A(IA+LA*INCA),WORK(1),WORK(IFAC*LA*LOT+1), & - & TRIGS,INCA,LOT,JUMPA,1,LOT,N,IFAC,LA,IERR,LIPL) - ELSE -!DEC$ FORCEINLINE - CALL RPASSF(WORK(1),WORK(LA*LOT+1),A(IA),A(IA+IFAC*LA*INCA), & - & TRIGS,LOT,INCA,1,JUMPA,LOT,N,IFAC,LA,IERR,LIPL) - ENDIF - IF (IERR.NE.0) THEN - IF (IERR.EQ.2) WRITE(6,901) IFAC - IF (IERR.EQ.3) WRITE(6,902) IFAC - IF (LHOOK) CALL DR_HOOK('FFT992',1,ZHOOK_HANDLE) - RETURN - ENDIF - LA=IFAC*LA - IGO=-IGO - IA=1 - ENDDO -! -! IF NECESSARY, COPY RESULTS BACK TO A -! ------------------------------------ - IF (NFAX.EQ.1) THEN - IBASE=1 - JBASE=1 - DO JJ=1,N - I=IBASE - J=JBASE - DO II=1,LOT - A(J)=WORK(I) - I=I+1 - J=J+JUMP - ENDDO - IBASE=IBASE+LOT - JBASE=JBASE+INC - ENDDO - ENDIF -! -! FILL IN ZEROS AT END -! -------------------- - IX=N*INC+1 -!OCL NOVREC -!DEC$ IVDEP - DO J=1,LOT - A(IX)=0.0_JPRB - A(IX+INC)=0.0_JPRB - IX=IX+JUMP - ENDDO -! - ELSEIF (ISIGN.EQ.-1) THEN -! -! ISIGN=-1, GRIDPOINT TO SPECTRAL TRANSFORM -! ----------------------------------------- - IA=1 - LA=N - IGO=+1 -! - DO K=1,NFAX - IFAC=IFAX(NFAX+2-K) - LA=LA/IFAC - IERR=-1 - IF (K.EQ.1.AND.NFAX.GT.2.AND.MOD(NFAX,2).EQ.1) THEN - LIPL=.TRUE. - ELSE - LIPL=.FALSE. - ENDIF - IF (INC.EQ.1.AND.JUMP.LT.(2*N).AND. & - & K.GT.(1+MOD(NFAX,2)).AND.K.LT.NFAX) THEN - INCA=LOT - JUMPA=1 - ELSE - INCA=INC - JUMPA=JUMP - ENDIF - IF (IGO.EQ.+1) THEN -!DEC$ FORCEINLINE - CALL QPASSF(A(IA),A(IA+IFAC*LA*INCA),WORK(1),WORK(LA*LOT+1), & - & TRIGS,INCA,LOT,JUMPA,1,LOT,N,IFAC,LA,IERR,LIPL) - ELSE -!DEC$ FORCEINLINE - CALL QPASSF(WORK(1),WORK(IFAC*LA*LOT+1),A(IA),A(IA+LA*INCA), & - & TRIGS,LOT,INCA,1,JUMPA,LOT,N,IFAC,LA,IERR,LIPL) - ENDIF - IF (IERR.NE.0) THEN - IF (IERR.EQ.2) WRITE(6,901) IFAC - IF (IERR.EQ.3) WRITE(6,902) IFAC - IF (LHOOK) CALL DR_HOOK('FFT992',1,ZHOOK_HANDLE) - RETURN - ENDIF - IF (LIPL) THEN - IA=1 - ELSE - IGO=-IGO - IA=INC+1 - ENDIF - ENDDO -! -! IF NECESSARY, COPY RESULTS BACK TO A -! ------------------------------------ - IF (NFAX.EQ.1) THEN - IBASE=1 - JBASE=INC+1 - DO JJ=1,N - I=IBASE - J=JBASE - DO II=1,LOT - A(J)=WORK(I) - I=I+1 - J=J+JUMP - ENDDO - IBASE=IBASE+LOT - JBASE=JBASE+INC - ENDDO - ENDIF -! -! SHIFT A(0) & FILL IN ZERO IMAG PARTS -! ------------------------------------ - IX=1 -!OCL NOVREC -!DEC$ IVDEP - DO J=1,LOT - A(IX)=A(IX+INC) - A(IX+INC)=0.0_JPRB - IX=IX+JUMP - ENDDO - IF (MOD(N,2).EQ.0) THEN - IX=(N+1)*INC+1 - DO J=1,LOT - A(IX)=0.0_JPRB - IX=IX+JUMP - ENDDO - ENDIF -! - ENDIF -! -! FORMAT STATEMENTS FOR ERROR MESSAGES: - 901 FORMAT(' FACTOR =',I3,' NOT CATERED FOR') - 902 FORMAT(' FACTOR =',I3,' ONLY CATERED FOR IF LA*IFAC=N') -! - IF (LHOOK) CALL DR_HOOK('FFT992',1,ZHOOK_HANDLE) - - CONTAINS -! SUBROUTINE 'RPASSF' - PERFORMS ONE PASS THROUGH DATA AS PART -! OF MULTIPLE REAL FFT (FOURIER SYNTHESIS) ROUTINE -! -! A IS FIRST REAL INPUT VECTOR -! EQUIVALENCE B(1) WITH A (LA*INC1+1) -! C IS FIRST REAL OUTPUT VECTOR -! EQUIVALENCE D(1) WITH C(IFAC*LA*INC2+1) -! TRIGS IS A PRECALCULATED LIST OF SINES & COSINES -! INC1 IS THE ADDRESSING INCREMENT FOR A -! INC2 IS THE ADDRESSING INCREMENT FOR C -! INC3 IS THE INCREMENT BETWEEN INPUT VECTORS A -! INC4 IS THE INCREMENT BETWEEN OUTPUT VECTORS C -! LOT IS THE NUMBER OF VECTORS -! N IS THE LENGTH OF THE VECTORS -! IFAC IS THE CURRENT FACTOR OF N -! LA IS THE PRODUCT OF PREVIOUS FACTORS -! IERR IS AN ERROR INDICATOR: -! 0 - PASS COMPLETED WITHOUT ERROR -! 1 - LOT GREATER THAN 64 -! 2 - IFAC NOT CATERED FOR -! 3 - IFAC ONLY CATERED FOR IF LA=N/IFAC -! LIPL=.T. => RESULTS ARE RETURNED TO INPUT ARRAY -! (ONLY VALID IF LA=N/IFAC, I.E. ON LAST PASS) -! -!----------------------------------------------------------------------- -! - SUBROUTINE RPASSF(A,B,C,D,TRIGS,INC1,INC2,INC3,INC4,LOT,N,IFAC, & - & LA,IERR,LIPL) -!AUTOPROMOTE -! - USE PARKIND1, ONLY : JPIM, JPRB - USE YOMHOOK , ONLY : DR_HOOK, JPHOOK -! - IMPLICIT NONE -! - INTEGER(KIND=JPIM) :: N - REAL(KIND=JPRB) :: A(*) - REAL(KIND=JPRB) :: B(*) - REAL(KIND=JPRB) :: C(*) - REAL(KIND=JPRB) :: D(*) - REAL(KIND=JPRB) :: TRIGS(N) - REAL(KIND=JPRB) :: A10,A11,A20,A21 - REAL(KIND=JPRB) :: B10,B11,B20,B21 - REAL(KIND=JPRB) :: C1,C2,C3,C4,C5 - REAL(KIND=JPRB) :: S1,S2,S3,S4,S5 - REAL(KIND=JPRB) :: SIN36,SIN45,SIN60,SIN72 - REAL(KIND=JPRB) :: SSIN36,SSIN45,SSIN60,SSIN72 - REAL(KIND=JPRB) :: QRT5,QQRT5 - REAL(KIND=JPRB) :: T1,T2,T3,T4,T5,T6,T7 - INTEGER(KIND=JPIM) :: IERR - INTEGER(KIND=JPIM) :: INC1 - INTEGER(KIND=JPIM) :: INC2 - INTEGER(KIND=JPIM) :: INC3 - INTEGER(KIND=JPIM) :: INC4 - INTEGER(KIND=JPIM) :: LOT - INTEGER(KIND=JPIM) :: IFAC - INTEGER(KIND=JPIM) :: LA - INTEGER(KIND=JPIM) :: INC21,IINK,IJK,ILOT,ILA - INTEGER(KIND=JPIM) :: I,IA,IB,IBAD,IBASE,IC,ID,IE,IF - INTEGER(KIND=JPIM) :: J,JA,JB,JBASE,JC,JD,JE,JF,JG,JH,JINK,JUMP - INTEGER(KIND=JPIM) :: K,KB,KC,KD,KE,KF,KSTOP - INTEGER(KIND=JPIM) :: L,M - LOGICAL :: LIPL -! - DATA SIN36/0.587785252292473_JPRB/,SIN72/0.951056516295154_JPRB/, & - & QRT5/0.559016994374947_JPRB/,SIN60/0.866025403784437_JPRB/ -! - M=N/IFAC - IINK=LA*INC1 - JINK=LA*INC2 - JUMP=(IFAC-1)*JINK - KSTOP=(N-IFAC)/(2*IFAC) -! - IBASE=0 - JBASE=0 - IBAD=0 -! -! Increase the vector length by fusing the loops if the -! data layout is appropriate: - IF (INC1.EQ.LOT.AND.INC2.EQ.LOT.AND.INC3.EQ.1.AND.INC4.EQ.1) THEN - ILA=1 - ILOT=LA*LOT - INC21=LA*LOT - ELSE - ILA=LA - ILOT=LOT - INC21=INC2 - ENDIF -! - IF (IFAC.EQ.2) THEN -! -! CODING FOR FACTOR 2 -! ------------------- - 200 CONTINUE - IA=1 - IB=IA+(2*M-LA)*INC1 - JA=1 - JB=JA+JINK -! - IF (LA.NE.M) THEN -! - DO 220 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 210 IJK=1,ILOT - C(JA+J)=A(IA+I)+A(IB+I) - C(JB+J)=A(IA+I)-A(IB+I) - I=I+INC3 - J=J+INC4 - 210 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 220 CONTINUE - IA=IA+IINK - IINK=2*IINK - IB=IB-IINK - IBASE=0 - JBASE=JBASE+JUMP - JUMP=2*JUMP+JINK -! - IF (IA.LT.IB) THEN - DO 250 K=LA,KSTOP,LA - KB=K+K - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - IBASE=0 - DO 240 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 230 IJK=1,ILOT - C(JA+J)=A(IA+I)+A(IB+I) - D(JA+J)=B(IA+I)-B(IB+I) - C(JB+J)=C1*(A(IA+I)-A(IB+I))-S1*(B(IA+I)+B(IB+I)) - D(JB+J)=S1*(A(IA+I)-A(IB+I))+C1*(B(IA+I)+B(IB+I)) - I=I+INC3 - J=J+INC4 - 230 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 240 CONTINUE - IA=IA+IINK - IB=IB-IINK - JBASE=JBASE+JUMP - 250 CONTINUE - ENDIF -! - IF (IA.EQ.IB) THEN - IBASE=0 - DO 280 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 270 IJK=1,ILOT - C(JA+J)=A(IA+I) - C(JB+J)=-B(IA+I) - I=I+INC3 - J=J+INC4 - 270 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 280 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - IF (LIPL) THEN - DO 294 L=1,ILA - I=IBASE -!OCL NOVREC -!NEC$ ivdep - DO 292 IJK=1,ILOT - T1=2.0*(A(IA+I)-A(IB+I)) - A(IA+I)=2.0_JPRB*(A(IA+I)+A(IB+I)) - A(IB+I)=T1 - I=I+INC3 - 292 CONTINUE - IBASE=IBASE+INC1 - 294 CONTINUE - ELSE - DO 298 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 296 IJK=1,ILOT - C(JA+J)=2.0_JPRB*(A(IA+I)+A(IB+I)) - C(JB+J)=2.0_JPRB*(A(IA+I)-A(IB+I)) - I=I+INC3 - J=J+INC4 - 296 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 298 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.3) THEN -! -! CODING FOR FACTOR 3 -! ------------------- - 300 CONTINUE - IA=1 - IB=IA+(2*M-LA)*INC1 - IC=IB - JA=1 - JB=JA+JINK - JC=JB+JINK -! - IF (LA.NE.M) THEN -! - DO 320 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 310 IJK=1,ILOT - C(JA+J)=A(IA+I)+A(IB+I) - C(JB+J)=(A(IA+I)-0.5_JPRB*A(IB+I))-(SIN60*(B(IB+I))) - C(JC+J)=(A(IA+I)-0.5_JPRB*A(IB+I))+(SIN60*(B(IB+I))) - I=I+INC3 - J=J+INC4 - 310 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 320 CONTINUE - IA=IA+IINK - IINK=2*IINK - IB=IB+IINK - IC=IC-IINK - JBASE=JBASE+JUMP - JUMP=2*JUMP+JINK -! - IF (IA.LT.IC) THEN - DO 350 K=LA,KSTOP,LA - KB=K+K - KC=KB+KB - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - C2=TRIGS(KC+1) - S2=TRIGS(KC+2) - IBASE=0 - DO 340 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 330 IJK=1,ILOT - C(JA+J)=A(IA+I)+(A(IB+I)+A(IC+I)) - D(JA+J)=B(IA+I)+(B(IB+I)-B(IC+I)) - C(JB+J)= & - & C1*((A(IA+I)-0.5_JPRB*(A(IB+I)+A(IC+I)))- & - & (SIN60*(B(IB+I)+B(IC+I)))) & - & -S1*((B(IA+I)-0.5_JPRB*(B(IB+I)-B(IC+I)))+ & - & (SIN60*(A(IB+I)-A(IC+I)))) - D(JB+J)= & - & S1*((A(IA+I)-0.5_JPRB*(A(IB+I)+A(IC+I)))- & - & (SIN60*(B(IB+I)+B(IC+I)))) & - & +C1*((B(IA+I)-0.5_JPRB*(B(IB+I)-B(IC+I)))+ & - & (SIN60*(A(IB+I)-A(IC+I)))) - C(JC+J)= & - & C2*((A(IA+I)-0.5_JPRB*(A(IB+I)+A(IC+I)))+ & - & (SIN60*(B(IB+I)+B(IC+I)))) & - & -S2*((B(IA+I)-0.5_JPRB*(B(IB+I)-B(IC+I)))- & - & (SIN60*(A(IB+I)-A(IC+I)))) - D(JC+J)= & - & S2*((A(IA+I)-0.5_JPRB*(A(IB+I)+A(IC+I)))+ & - & (SIN60*(B(IB+I)+B(IC+I)))) & - & +C2*((B(IA+I)-0.5_JPRB*(B(IB+I)-B(IC+I)))- & - & (SIN60*(A(IB+I)-A(IC+I)))) - I=I+INC3 - J=J+INC4 - 330 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 340 CONTINUE - IA=IA+IINK - IB=IB+IINK - IC=IC-IINK - JBASE=JBASE+JUMP - 350 CONTINUE - ENDIF -! - IF (IA.EQ.IC) THEN - IBASE=0 - DO 380 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 370 IJK=1,ILOT - C(JA+J)=A(IA+I)+A(IB+I) - C(JB+J)=(0.5_JPRB*A(IA+I)-A(IB+I))-(SIN60*B(IA+I)) - C(JC+J)=-(0.5_JPRB*A(IA+I)-A(IB+I))-(SIN60*B(IA+I)) - I=I+INC3 - J=J+INC4 - 370 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 380 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - SSIN60=2.0*SIN60 - IF (LIPL) THEN - DO 394 L=1,ILA - I=IBASE -!OCL NOVREC -!NEC$ ivdep - DO 392 IJK=1,ILOT - T1=(2.0_JPRB*A(IA+I)-A(IB+I))-(SSIN60*B(IB+I)) - T2=(2.0_JPRB*A(IA+I)-A(IB+I))+(SSIN60*B(IB+I)) - A(IA+I)=2.0_JPRB*(A(IA+I)+A(IB+I)) - A(IB+I)=T1 - B(IB+I)=T2 - I=I+INC3 - 392 CONTINUE - IBASE=IBASE+INC1 - 394 CONTINUE - ELSE - DO 398 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 396 IJK=1,ILOT - C(JA+J)=2.0_JPRB*(A(IA+I)+A(IB+I)) - C(JB+J)=(2.0_JPRB*A(IA+I)-A(IB+I))-(SSIN60*B(IB+I)) - C(JC+J)=(2.0_JPRB*A(IA+I)-A(IB+I))+(SSIN60*B(IB+I)) - I=I+INC3 - J=J+INC4 - 396 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 398 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.4) THEN -! -! CODING FOR FACTOR 4 -! ------------------- - 400 CONTINUE - IA=1 - IB=IA+(2*M-LA)*INC1 - IC=IB+2*M*INC1 - ID=IB - JA=1 - JB=JA+JINK - JC=JB+JINK - JD=JC+JINK -! - IF (LA.NE.M) THEN -! - DO 420 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 410 IJK=1,ILOT - C(JA+J)=(A(IA+I)+A(IC+I))+A(IB+I) - C(JB+J)=(A(IA+I)-A(IC+I))-B(IB+I) - C(JC+J)=(A(IA+I)+A(IC+I))-A(IB+I) - C(JD+J)=(A(IA+I)-A(IC+I))+B(IB+I) - I=I+INC3 - J=J+INC4 - 410 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 420 CONTINUE - IA=IA+IINK - IINK=2*IINK - IB=IB+IINK - IC=IC-IINK - ID=ID-IINK - JBASE=JBASE+JUMP - JUMP=2*JUMP+JINK -! - IF (IB.LT.IC) THEN - DO 450 K=LA,KSTOP,LA - KB=K+K - KC=KB+KB - KD=KC+KB - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - C2=TRIGS(KC+1) - S2=TRIGS(KC+2) - C3=TRIGS(KD+1) - S3=TRIGS(KD+2) - IBASE=0 - DO 440 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 430 IJK=1,ILOT - C(JA+J)=(A(IA+I)+A(IC+I))+(A(IB+I)+A(ID+I)) - D(JA+J)=(B(IA+I)-B(IC+I))+(B(IB+I)-B(ID+I)) - C(JC+J)= & - & C2*((A(IA+I)+A(IC+I))-(A(IB+I)+A(ID+I))) & - & -S2*((B(IA+I)-B(IC+I))-(B(IB+I)-B(ID+I))) - D(JC+J)= & - & S2*((A(IA+I)+A(IC+I))-(A(IB+I)+A(ID+I))) & - & +C2*((B(IA+I)-B(IC+I))-(B(IB+I)-B(ID+I))) - C(JB+J)= & - & C1*((A(IA+I)-A(IC+I))-(B(IB+I)+B(ID+I))) & - & -S1*((B(IA+I)+B(IC+I))+(A(IB+I)-A(ID+I))) - D(JB+J)= & - & S1*((A(IA+I)-A(IC+I))-(B(IB+I)+B(ID+I))) & - & +C1*((B(IA+I)+B(IC+I))+(A(IB+I)-A(ID+I))) - C(JD+J)= & - & C3*((A(IA+I)-A(IC+I))+(B(IB+I)+B(ID+I))) & - & -S3*((B(IA+I)+B(IC+I))-(A(IB+I)-A(ID+I))) - D(JD+J)= & - & S3*((A(IA+I)-A(IC+I))+(B(IB+I)+B(ID+I))) & - & +C3*((B(IA+I)+B(IC+I))-(A(IB+I)-A(ID+I))) - I=I+INC3 - J=J+INC4 - 430 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 440 CONTINUE - IA=IA+IINK - IB=IB+IINK - IC=IC-IINK - ID=ID-IINK - JBASE=JBASE+JUMP - 450 CONTINUE - ENDIF -! - IF (IB.EQ.IC) THEN - IBASE=0 - SIN45=SQRT(0.5_JPRB) - DO 480 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 470 IJK=1,ILOT - C(JA+J)=A(IA+I)+A(IB+I) - C(JB+J)=SIN45*((A(IA+I)-A(IB+I))-(B(IA+I)+B(IB+I))) - C(JC+J)=B(IB+I)-B(IA+I) - C(JD+J)=-SIN45*((A(IA+I)-A(IB+I))+(B(IA+I)+B(IB+I))) - I=I+INC3 - J=J+INC4 - 470 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 480 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - IF (LIPL) THEN - DO 494 L=1,ILA - I=IBASE -!OCL NOVREC -!NEC$ ivdep - DO 492 IJK=1,ILOT - T1=2.0_JPRB*((A(IA+I)-A(IC+I))-B(IB+I)) - T2=2.0_JPRB*((A(IA+I)+A(IC+I))-A(IB+I)) - T3=2.0_JPRB*((A(IA+I)-A(IC+I))+B(IB+I)) - A(IA+I)=2.0_JPRB*((A(IA+I)+A(IC+I))+A(IB+I)) - A(IB+I)=T1 - B(IB+I)=T2 - A(IC+I)=T3 - I=I+INC3 - 492 CONTINUE - IBASE=IBASE+INC1 - 494 CONTINUE - ELSE - DO 498 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 496 IJK=1,ILOT - C(JA+J)=2.0_JPRB*((A(IA+I)+A(IC+I))+A(IB+I)) - C(JB+J)=2.0_JPRB*((A(IA+I)-A(IC+I))-B(IB+I)) - C(JC+J)=2.0_JPRB*((A(IA+I)+A(IC+I))-A(IB+I)) - C(JD+J)=2.0_JPRB*((A(IA+I)-A(IC+I))+B(IB+I)) - I=I+INC3 - J=J+INC4 - 496 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 498 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.5) THEN -! -! CODING FOR FACTOR 5 -! ------------------- - 500 CONTINUE - IA=1 - IB=IA+(2*M-LA)*INC1 - IC=IB+2*M*INC1 - ID=IC - IE=IB - JA=1 - JB=JA+JINK - JC=JB+JINK - JD=JC+JINK - JE=JD+JINK -! - IF (LA.NE.M) THEN -! - DO 520 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 510 IJK=1,ILOT - C(JA+J)=A(IA+I)+(A(IB+I)+A(IC+I)) - C(JB+J)=((A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I)))+ & - & QRT5*(A(IB+I)-A(IC+I)))-(SIN72*B(IB+I)+SIN36*B(IC+I)) - C(JC+J)=((A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I)))- & - & QRT5*(A(IB+I)-A(IC+I)))-(SIN36*B(IB+I)-SIN72*B(IC+I)) - C(JD+J)=((A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I)))- & - & QRT5*(A(IB+I)-A(IC+I)))+(SIN36*B(IB+I)-SIN72*B(IC+I)) - C(JE+J)=((A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I)))+ & - & QRT5*(A(IB+I)-A(IC+I)))+(SIN72*B(IB+I)+SIN36*B(IC+I)) - I=I+INC3 - J=J+INC4 - 510 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 520 CONTINUE - IA=IA+IINK - IINK=2*IINK - IB=IB+IINK - IC=IC+IINK - ID=ID-IINK - IE=IE-IINK - JBASE=JBASE+JUMP - JUMP=2*JUMP+JINK -! - IF (IB.LT.ID) THEN - DO 550 K=LA,KSTOP,LA - KB=K+K - KC=KB+KB - KD=KC+KB - KE=KD+KB - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - C2=TRIGS(KC+1) - S2=TRIGS(KC+2) - C3=TRIGS(KD+1) - S3=TRIGS(KD+2) - C4=TRIGS(KE+1) - S4=TRIGS(KE+2) - IBASE=0 - DO 540 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 530 IJK=1,ILOT -! - A10=(A(IA+I)-0.25_JPRB*((A(IB+I)+A(IE+I))+(A(IC+I)+A(ID+I)))) & - & +QRT5*((A(IB+I)+A(IE+I))-(A(IC+I)+A(ID+I))) - A20=(A(IA+I)-0.25_JPRB*((A(IB+I)+A(IE+I))+(A(IC+I)+A(ID+I)))) & - & -QRT5*((A(IB+I)+A(IE+I))-(A(IC+I)+A(ID+I))) - B10=(B(IA+I)-0.25_JPRB*((B(IB+I)-B(IE+I))+(B(IC+I)-B(ID+I)))) & - & +QRT5*((B(IB+I)-B(IE+I))-(B(IC+I)-B(ID+I))) - B20=(B(IA+I)-0.25_JPRB*((B(IB+I)-B(IE+I))+(B(IC+I)-B(ID+I)))) & - & -QRT5*((B(IB+I)-B(IE+I))-(B(IC+I)-B(ID+I))) - A11=SIN72*(B(IB+I)+B(IE+I))+SIN36*(B(IC+I)+B(ID+I)) - A21=SIN36*(B(IB+I)+B(IE+I))-SIN72*(B(IC+I)+B(ID+I)) - B11=SIN72*(A(IB+I)-A(IE+I))+SIN36*(A(IC+I)-A(ID+I)) - B21=SIN36*(A(IB+I)-A(IE+I))-SIN72*(A(IC+I)-A(ID+I)) -! - C(JA+J)=A(IA+I)+((A(IB+I)+A(IE+I))+(A(IC+I)+A(ID+I))) - D(JA+J)=B(IA+I)+((B(IB+I)-B(IE+I))+(B(IC+I)-B(ID+I))) - C(JB+J)=C1*(A10-A11)-S1*(B10+B11) - D(JB+J)=S1*(A10-A11)+C1*(B10+B11) - C(JE+J)=C4*(A10+A11)-S4*(B10-B11) - D(JE+J)=S4*(A10+A11)+C4*(B10-B11) - C(JC+J)=C2*(A20-A21)-S2*(B20+B21) - D(JC+J)=S2*(A20-A21)+C2*(B20+B21) - C(JD+J)=C3*(A20+A21)-S3*(B20-B21) - D(JD+J)=S3*(A20+A21)+C3*(B20-B21) -! - I=I+INC3 - J=J+INC4 - 530 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 540 CONTINUE - IA=IA+IINK - IB=IB+IINK - IC=IC+IINK - ID=ID-IINK - IE=IE-IINK - JBASE=JBASE+JUMP - 550 CONTINUE - ENDIF -! - IF (IB.EQ.ID) THEN - IBASE=0 - DO 580 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 570 IJK=1,ILOT - C(JA+J)=(A(IA+I)+A(IB+I))+A(IC+I) - C(JB+J)=(QRT5*(A(IA+I)-A(IB+I))+ & - & (0.25_JPRB*(A(IA+I)+A(IB+I))-A(IC+I))) & - & -(SIN36*B(IA+I)+SIN72*B(IB+I)) - C(JE+J)=-(QRT5*(A(IA+I)-A(IB+I))+ & - & (0.25_JPRB*(A(IA+I)+A(IB+I))-A(IC+I))) & - & -(SIN36*B(IA+I)+SIN72*B(IB+I)) - C(JC+J)=(QRT5*(A(IA+I)-A(IB+I))- & - & (0.25_JPRB*(A(IA+I)+A(IB+I))-A(IC+I))) & - & -(SIN72*B(IA+I)-SIN36*B(IB+I)) - C(JD+J)=-(QRT5*(A(IA+I)-A(IB+I))- & - & (0.25_JPRB*(A(IA+I)+A(IB+I))-A(IC+I))) & - & -(SIN72*B(IA+I)-SIN36*B(IB+I)) - I=I+INC3 - J=J+INC4 - 570 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 580 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - QQRT5=2.0*QRT5 - SSIN36=2.0*SIN36 - SSIN72=2.0*SIN72 - IF (LIPL) THEN - DO 594 L=1,ILA - I=IBASE -!OCL NOVREC -!NEC$ ivdep - DO 592 IJK=1,ILOT - T1=(2.0_JPRB*(A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I))) & - & +QQRT5*(A(IB+I)-A(IC+I)))-(SSIN72*B(IB+I)+SSIN36*B(IC+I)) - T2=(2.0_JPRB*(A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I))) & - & -QQRT5*(A(IB+I)-A(IC+I)))-(SSIN36*B(IB+I)-SSIN72*B(IC+I)) - T3=(2.0_JPRB*(A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I))) & - & -QQRT5*(A(IB+I)-A(IC+I)))+(SSIN36*B(IB+I)-SSIN72*B(IC+I)) - T4=(2.0_JPRB*(A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I))) & - & +QQRT5*(A(IB+I)-A(IC+I)))+(SSIN72*B(IB+I)+SSIN36*B(IC+I)) - A(IA+I)=2.0_JPRB*(A(IA+I)+(A(IB+I)+A(IC+I))) - A(IB+I)=T1 - B(IB+I)=T2 - A(IC+I)=T3 - B(IC+I)=T4 - I=I+INC3 - 592 CONTINUE - IBASE=IBASE+INC1 - 594 CONTINUE - ELSE - DO 598 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 596 IJK=1,ILOT - C(JA+J)=2.0_JPRB*(A(IA+I)+(A(IB+I)+A(IC+I))) - C(JB+J)=(2.0_JPRB*(A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I))) & - & +QQRT5*(A(IB+I)-A(IC+I)))-(SSIN72*B(IB+I)+SSIN36*B(IC+I)) - C(JC+J)=(2.0_JPRB*(A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I))) & - & -QQRT5*(A(IB+I)-A(IC+I)))-(SSIN36*B(IB+I)-SSIN72*B(IC+I)) - C(JD+J)=(2.0_JPRB*(A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I))) & - & -QQRT5*(A(IB+I)-A(IC+I)))+(SSIN36*B(IB+I)-SSIN72*B(IC+I)) - C(JE+J)=(2.0_JPRB*(A(IA+I)-0.25_JPRB*(A(IB+I)+A(IC+I))) & - & +QQRT5*(A(IB+I)-A(IC+I)))+(SSIN72*B(IB+I)+SSIN36*B(IC+I)) - I=I+INC3 - J=J+INC4 - 596 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 598 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.6) THEN -! -! CODING FOR FACTOR 6 -! ------------------- - 600 CONTINUE - IA=1 - IB=IA+(2*M-LA)*INC1 - IC=IB+2*M*INC1 - ID=IC+2*M*INC1 - IE=IC - IF=IB - JA=1 - JB=JA+JINK - JC=JB+JINK - JD=JC+JINK - JE=JD+JINK - JF=JE+JINK -! - IF (LA.NE.M) THEN -! - DO 620 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 610 IJK=1,ILOT - C(JA+J)=(A(IA+I)+A(ID+I))+(A(IB+I)+A(IC+I)) - C(JD+J)=(A(IA+I)-A(ID+I))-(A(IB+I)-A(IC+I)) - C(JB+J)=((A(IA+I)-A(ID+I))+0.5_JPRB*(A(IB+I)-A(IC+I))) & - & -(SIN60*(B(IB+I)+B(IC+I))) - C(JF+J)=((A(IA+I)-A(ID+I))+0.5_JPRB*(A(IB+I)-A(IC+I))) & - & +(SIN60*(B(IB+I)+B(IC+I))) - C(JC+J)=((A(IA+I)+A(ID+I))-0.5_JPRB*(A(IB+I)+A(IC+I))) & - & -(SIN60*(B(IB+I)-B(IC+I))) - C(JE+J)=((A(IA+I)+A(ID+I))-0.5_JPRB*(A(IB+I)+A(IC+I))) & - & +(SIN60*(B(IB+I)-B(IC+I))) - I=I+INC3 - J=J+INC4 - 610 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 620 CONTINUE - IA=IA+IINK - IINK=2*IINK - IB=IB+IINK - IC=IC+IINK - ID=ID-IINK - IE=IE-IINK - IF=IF-IINK - JBASE=JBASE+JUMP - JUMP=2*JUMP+JINK -! - IF (IC.LT.ID) THEN - DO 650 K=LA,KSTOP,LA - KB=K+K - KC=KB+KB - KD=KC+KB - KE=KD+KB - KF=KE+KB - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - C2=TRIGS(KC+1) - S2=TRIGS(KC+2) - C3=TRIGS(KD+1) - S3=TRIGS(KD+2) - C4=TRIGS(KE+1) - S4=TRIGS(KE+2) - C5=TRIGS(KF+1) - S5=TRIGS(KF+2) - IBASE=0 - DO 640 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 630 IJK=1,ILOT -! - A11= (A(IE+I)+A(IB+I))+(A(IC+I)+A(IF+I)) - A20=(A(IA+I)+A(ID+I))-0.5_JPRB*A11 - A21=SIN60*((A(IE+I)+A(IB+I))-(A(IC+I)+A(IF+I))) - B11= (B(IB+I)-B(IE+I))+(B(IC+I)-B(IF+I)) - B20=(B(IA+I)-B(ID+I))-0.5_JPRB*B11 - B21=SIN60*((B(IB+I)-B(IE+I))-(B(IC+I)-B(IF+I))) -! - C(JA+J)=(A(IA+I)+A(ID+I))+A11 - D(JA+J)=(B(IA+I)-B(ID+I))+B11 - C(JC+J)=C2*(A20-B21)-S2*(B20+A21) - D(JC+J)=S2*(A20-B21)+C2*(B20+A21) - C(JE+J)=C4*(A20+B21)-S4*(B20-A21) - D(JE+J)=S4*(A20+B21)+C4*(B20-A21) -! - A11=(A(IE+I)-A(IB+I))+(A(IC+I)-A(IF+I)) - B11=(B(IE+I)+B(IB+I))-(B(IC+I)+B(IF+I)) - A20=(A(IA+I)-A(ID+I))-0.5_JPRB*A11 - A21=SIN60*((A(IE+I)-A(IB+I))-(A(IC+I)-A(IF+I))) - B20=(B(IA+I)+B(ID+I))+0.5_JPRB*B11 - B21=SIN60*((B(IE+I)+B(IB+I))+(B(IC+I)+B(IF+I))) -! - C(JD+J)= & - & C3*((A(IA+I)-A(ID+I))+A11)-S3*((B(IA+I)+B(ID+I))-B11) - D(JD+J)= & - & S3*((A(IA+I)-A(ID+I))+A11)+C3*((B(IA+I)+B(ID+I))-B11) - C(JB+J)=C1*(A20-B21)-S1*(B20-A21) - D(JB+J)=S1*(A20-B21)+C1*(B20-A21) - C(JF+J)=C5*(A20+B21)-S5*(B20+A21) - D(JF+J)=S5*(A20+B21)+C5*(B20+A21) -! - I=I+INC3 - J=J+INC4 - 630 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 640 CONTINUE - IA=IA+IINK - IB=IB+IINK - IC=IC+IINK - ID=ID-IINK - IE=IE-IINK - IF=IF-IINK - JBASE=JBASE+JUMP - 650 CONTINUE - ENDIF -! - IF (IC.EQ.ID) THEN - IBASE=0 - DO 680 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 670 IJK=1,ILOT - C(JA+J)=A(IB+I)+(A(IA+I)+A(IC+I)) - C(JD+J)=B(IB+I)-(B(IA+I)+B(IC+I)) - C(JB+J)=(SIN60*(A(IA+I)-A(IC+I)))- & - & (0.5_JPRB*(B(IA+I)+B(IC+I))+B(IB+I)) - C(JF+J)=-(SIN60*(A(IA+I)-A(IC+I)))- & - & (0.5_JPRB*(B(IA+I)+B(IC+I))+B(IB+I)) - C(JC+J)=SIN60*(B(IC+I)-B(IA+I))+ & - & (0.5_JPRB*(A(IA+I)+A(IC+I))-A(IB+I)) - C(JE+J)=SIN60*(B(IC+I)-B(IA+I))- & - & (0.5_JPRB*(A(IA+I)+A(IC+I))-A(IB+I)) - I=I+INC3 - J=J+INC4 - 670 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 680 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - SSIN60=2.0_JPRB*SIN60 - IF (LIPL) THEN - DO 694 L=1,ILA - I=IBASE -!OCL NOVREC -!NEC$ ivdep - DO 692 IJK=1,ILOT - T1=(2.0_JPRB*(A(IA+I)-A(ID+I))+(A(IB+I)-A(IC+I))) & - & -(SSIN60*(B(IB+I)+B(IC+I))) - T5=(2.0_JPRB*(A(IA+I)-A(ID+I))+(A(IB+I)-A(IC+I))) & - & +(SSIN60*(B(IB+I)+B(IC+I))) - T2=(2.0_JPRB*(A(IA+I)+A(ID+I))-(A(IB+I)+A(IC+I))) & - & -(SSIN60*(B(IB+I)-B(IC+I))) - T4=(2.0_JPRB*(A(IA+I)+A(ID+I))-(A(IB+I)+A(IC+I))) & - & +(SSIN60*(B(IB+I)-B(IC+I))) - T3=(2.0_JPRB*(A(IA+I)-A(ID+I)))-(2.0_JPRB*(A(IB+I)-A(IC+I))) - A(IA+I)=(2.0_JPRB*(A(IA+I)+A(ID+I)))+ & - & (2.0_JPRB*(A(IB+I)+A(IC+I))) - A(IB+I)=T1 - B(IB+I)=T2 - A(IC+I)=T3 - B(IC+I)=T4 - A(ID+I)=T5 - I=I+INC3 - 692 CONTINUE - IBASE=IBASE+INC1 - 694 CONTINUE - ELSE - DO 698 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 696 IJK=1,ILOT - C(JA+J)=(2.0_JPRB*(A(IA+I)+A(ID+I)))+ & - & (2.0_JPRB*(A(IB+I)+A(IC+I))) - C(JD+J)=(2.0_JPRB*(A(IA+I)-A(ID+I)))- & - & (2.0_JPRB*(A(IB+I)-A(IC+I))) - C(JB+J)=(2.0_JPRB*(A(IA+I)-A(ID+I))+(A(IB+I)-A(IC+I))) & - & -(SSIN60*(B(IB+I)+B(IC+I))) - C(JF+J)=(2.0_JPRB*(A(IA+I)-A(ID+I))+(A(IB+I)-A(IC+I))) & - & +(SSIN60*(B(IB+I)+B(IC+I))) - C(JC+J)=(2.0_JPRB*(A(IA+I)+A(ID+I))-(A(IB+I)+A(IC+I))) & - & -(SSIN60*(B(IB+I)-B(IC+I))) - C(JE+J)=(2.0_JPRB*(A(IA+I)+A(ID+I))-(A(IB+I)+A(IC+I))) & - & +(SSIN60*(B(IB+I)-B(IC+I))) - I=I+INC3 - J=J+INC4 - 696 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 698 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.8) THEN -! -! CODING FOR FACTOR 8 -! ------------------- - 800 CONTINUE - IF (LA.NE.M) THEN - IBAD=3 - ELSE - IA=1 - IB=IA+LA*INC1 - IC=IB+2*LA*INC1 - ID=IC+2*LA*INC1 - IE=ID+2*LA*INC1 - JA=1 - JB=JA+JINK - JC=JB+JINK - JD=JC+JINK - JE=JD+JINK - JF=JE+JINK - JG=JF+JINK - JH=JG+JINK - SSIN45=SQRT(2.0_JPRB) -! - IF (LIPL) THEN - DO 820 L=1,ILA - I=IBASE -!OCL NOVREC -!NEC$ ivdep - DO 810 IJK=1,ILOT - T2=2.0_JPRB*(((A(IA+I)+A(IE+I))-A(IC+I))-(B(IB+I)-B(ID+I))) - T6=2.0_JPRB*(((A(IA+I)+A(IE+I))-A(IC+I))+(B(IB+I)-B(ID+I))) - T1=2.0_JPRB*((A(IA+I)-A(IE+I))-B(IC+I)) & - & +SSIN45*((A(IB+I)-A(ID+I))-(B(IB+I)+B(ID+I))) - T5=2.0_JPRB*((A(IA+I)-A(IE+I))-B(IC+I)) & - & -SSIN45*((A(IB+I)-A(ID+I))-(B(IB+I)+B(ID+I))) - T3=2.0_JPRB*((A(IA+I)-A(IE+I))+B(IC+I)) & - & -SSIN45*((A(IB+I)-A(ID+I))+(B(IB+I)+B(ID+I))) - T7=2.0_JPRB*((A(IA+I)-A(IE+I))+B(IC+I)) & - & +SSIN45*((A(IB+I)-A(ID+I))+(B(IB+I)+B(ID+I))) - T4=2.0_JPRB*(((A(IA+I)+A(IE+I))+A(IC+I))-(A(IB+I)+A(ID+I))) - A(IA+I)=2.0_JPRB*(((A(IA+I)+A(IE+I))+A(IC+I))+(A(IB+I)+A(ID+I))) - A(IB+I)=T1 - B(IB+I)=T2 - A(IC+I)=T3 - B(IC+I)=T4 - A(ID+I)=T5 - B(ID+I)=T6 - A(IE+I)=T7 - I=I+INC3 - 810 CONTINUE - IBASE=IBASE+INC1 - 820 CONTINUE - ELSE - DO 840 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 830 IJK=1,ILOT - C(JA+J)=2.0_JPRB*(((A(IA+I)+A(IE+I))+A(IC+I))+(A(IB+I)+A(ID+I))) - C(JE+J)=2.0_JPRB*(((A(IA+I)+A(IE+I))+A(IC+I))-(A(IB+I)+A(ID+I))) - C(JC+J)=2.0_JPRB*(((A(IA+I)+A(IE+I))-A(IC+I))-(B(IB+I)-B(ID+I))) - C(JG+J)=2.0_JPRB*(((A(IA+I)+A(IE+I))-A(IC+I))+(B(IB+I)-B(ID+I))) - C(JB+J)=2.0_JPRB*((A(IA+I)-A(IE+I))-B(IC+I)) & - & +SSIN45*((A(IB+I)-A(ID+I))-(B(IB+I)+B(ID+I))) - C(JF+J)=2.0_JPRB*((A(IA+I)-A(IE+I))-B(IC+I)) & - & -SSIN45*((A(IB+I)-A(ID+I))-(B(IB+I)+B(ID+I))) - C(JD+J)=2.0_JPRB*((A(IA+I)-A(IE+I))+B(IC+I)) & - & -SSIN45*((A(IB+I)-A(ID+I))+(B(IB+I)+B(ID+I))) - C(JH+J)=2.0_JPRB*((A(IA+I)-A(IE+I))+B(IC+I)) & - & +SSIN45*((A(IB+I)-A(ID+I))+(B(IB+I)+B(ID+I))) - I=I+INC3 - J=J+INC4 - 830 CONTINUE - IBASE=IBASE+INC1 - JBASE=JBASE+INC21 - 840 CONTINUE - ENDIF -! - ENDIF -! - ELSE -! - IBAD=2 !!! Illegal factor -! - ENDIF -! -! RETURN -! ------ - 900 CONTINUE - IERR=IBAD - ENDSUBROUTINE RPASSF - -! SUBROUTINE 'QPASSF' - PERFORMS ONE PASS THROUGH DATA AS PART -! OF MULTIPLE REAL FFT (FOURIER ANALYSIS) ROUTINE -! -! A IS FIRST REAL INPUT VECTOR -! EQUIVALENCE B(1) WITH A(IFAC*LA*INC1+1) -! C IS FIRST REAL OUTPUT VECTOR -! EQUIVALENCE D(1) WITH C(LA*INC2+1) -! TRIGS IS A PRECALCULATED LIST OF SINES & COSINES -! INC1 IS THE ADDRESSING INCREMENT FOR A -! INC2 IS THE ADDRESSING INCREMENT FOR C -! INC3 IS THE INCREMENT BETWEEN INPUT VECTORS A -! INC4 IS THE INCREMENT BETWEEN OUTPUT VECTORS C -! LOT IS THE NUMBER OF VECTORS -! N IS THE LENGTH OF THE VECTORS -! IFAC IS THE CURRENT FACTOR OF N -! LA = N/(PRODUCT OF FACTORS USED SO FAR) -! IERR IS AN ERROR INDICATOR: -! 0 - PASS COMPLETED WITHOUT ERROR -! 1 - LOT GREATER THAN 64 -! 2 - IFAC NOT CATERED FOR -! 3 - IFAC ONLY CATERED FOR IF LA=N/IFAC -! LIPL=.T. => RESULTS ARE RETURNED TO INPUT ARRAY -! (ONLY VALID IF LA=N/IFAC, I.E. ON FIRST PASS) -! -!----------------------------------------------------------------------- -! - SUBROUTINE QPASSF(A,B,C,D,TRIGS,INC1,INC2,INC3,INC4,LOT,N,IFAC, & - & LA,IERR,LIPL) -!AUTOPROMOTE - USE PARKIND1, ONLY : JPIM, JPRB - USE YOMHOOK , ONLY : DR_HOOK, JPHOOK -! - IMPLICIT NONE -! - INTEGER(KIND=JPIM) :: N - REAL(KIND=JPRB) :: A(*) - REAL(KIND=JPRB) :: B(*) - REAL(KIND=JPRB) :: C(*) - REAL(KIND=JPRB) :: D(*) - REAL(KIND=JPRB) :: TRIGS(N) - REAL(KIND=JPRB) :: A0,A1,A2,A3,A4,A5,A6,A10,A11,A20,A21 - REAL(KIND=JPRB) :: B0,B1,B2,B3,B4,B5,B6,B10,B11,B20,B21 - REAL(KIND=JPRB) :: C1,C2,C3,C4,C5 - REAL(KIND=JPRB) :: S1,S2,S3,S4,S5 - REAL(KIND=JPRB) :: T1,T2,T3,T4,T5,T6,T7 - REAL(KIND=JPRB) :: Z - REAL(KIND=JPRB) :: QRT5,SIN36,SIN45,SIN60,SIN72 - REAL(KIND=JPRB) :: ZQRT5,ZSIN36,ZSIN45,ZSIN60,ZSIN72 - INTEGER(KIND=JPIM) :: IERR - INTEGER(KIND=JPIM) :: INC1 - INTEGER(KIND=JPIM) :: INC2 - INTEGER(KIND=JPIM) :: INC3 - INTEGER(KIND=JPIM) :: INC4 - INTEGER(KIND=JPIM) :: LOT - INTEGER(KIND=JPIM) :: IFAC - INTEGER(KIND=JPIM) :: LA - INTEGER(KIND=JPIM) :: IINK,IJK,ILOT - INTEGER(KIND=JPIM) :: I,IA,IB,IBAD,IBASE,IC,ID,IE,IF,IG,IH - INTEGER(KIND=JPIM) :: IJUMP,ILA,INC11 - INTEGER(KIND=JPIM) :: J,JA,JB,JC,JD,JE,JBASE,JF,JINK - INTEGER(KIND=JPIM) :: K,KB,KC,KD,KE,KF,KSTOP - INTEGER(KIND=JPIM) :: L,M - LOGICAL :: LIPL -! - DATA SIN36/0.587785252292473_JPRB/,SIN72/0.951056516295154_JPRB/, & - & QRT5/0.559016994374947_JPRB/,SIN60/0.866025403784437_JPRB/ -! - M=N/IFAC - IINK=LA*INC1 - JINK=LA*INC2 - IJUMP=(IFAC-1)*IINK - KSTOP=(N-IFAC)/(2*IFAC) -! - IBASE=0 - JBASE=0 - IBAD=0 -! -! Increase the vector length by fusing the loops if the -! data layout is appropriate: - IF (INC1.EQ.LOT.AND.INC2.EQ.LOT.AND.INC3.EQ.1.AND.INC4.EQ.1) THEN - ILA=1 - ILOT=LA*LOT - INC11=LA*LOT - ELSE - ILA=LA - ILOT=LOT - INC11=INC1 - ENDIF - -! - IF (IFAC.EQ.2) THEN -! -! CODING FOR FACTOR 2 -! ------------------- - 200 CONTINUE - IA=1 - IB=IA+IINK - JA=1 - JB=JA+(2*M-LA)*INC2 -! - IF (LA.NE.M) THEN -! - DO 220 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 210 IJK=1,ILOT - C(JA+J)=A(IA+I)+A(IB+I) - C(JB+J)=A(IA+I)-A(IB+I) - I=I+INC3 - J=J+INC4 - 210 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 220 CONTINUE - JA=JA+JINK - JINK=2*JINK - JB=JB-JINK - IBASE=IBASE+IJUMP - IJUMP=2*IJUMP+IINK -! - IF (JA.LT.JB) THEN - DO 250 K=LA,KSTOP,LA - KB=K+K - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - JBASE=0 - DO 240 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 230 IJK=1,ILOT - C(JA+J)=A(IA+I)+(C1*A(IB+I)+S1*B(IB+I)) - C(JB+J)=A(IA+I)-(C1*A(IB+I)+S1*B(IB+I)) - D(JA+J)=(C1*B(IB+I)-S1*A(IB+I))+B(IA+I) - D(JB+J)=(C1*B(IB+I)-S1*A(IB+I))-B(IA+I) - I=I+INC3 - J=J+INC4 - 230 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 240 CONTINUE - IBASE=IBASE+IJUMP - JA=JA+JINK - JB=JB-JINK - 250 CONTINUE - ENDIF -! - IF (JA.EQ.JB) THEN - JBASE=0 - DO 280 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 270 IJK=1,ILOT - C(JA+J)=A(IA+I) - D(JA+J)=-A(IB+I) - I=I+INC3 - J=J+INC4 - 270 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 280 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - Z=1.0_JPRB/REAL(N,KIND=JPRB) - IF (LIPL) THEN - DO 294 L=1,ILA - I=IBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 292 IJK=1,ILOT - T1=Z*(A(IA+I)-A(IB+I)) - A(IA+I)=Z*(A(IA+I)+A(IB+I)) - A(IB+I)=T1 - I=I+INC3 - 292 CONTINUE - IBASE=IBASE+INC11 - 294 CONTINUE - ELSE - DO 298 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 296 IJK=1,ILOT - C(JA+J)=Z*(A(IA+I)+A(IB+I)) - C(JB+J)=Z*(A(IA+I)-A(IB+I)) - I=I+INC3 - J=J+INC4 - 296 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 298 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.3) THEN -! -! CODING FOR FACTOR 3 -! ------------------- - 300 CONTINUE - IA=1 - IB=IA+IINK - IC=IB+IINK - JA=1 - JB=JA+(2*M-LA)*INC2 - JC=JB -! - IF (LA.NE.M) THEN -! - DO 320 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 310 IJK=1,ILOT - C(JA+J)=A(IA+I)+(A(IB+I)+A(IC+I)) - C(JB+J)=A(IA+I)-0.5_JPRB*(A(IB+I)+A(IC+I)) - D(JB+J)=SIN60*(A(IC+I)-A(IB+I)) - I=I+INC3 - J=J+INC4 - 310 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 320 CONTINUE - JA=JA+JINK - JINK=2*JINK - JB=JB+JINK - JC=JC-JINK - IBASE=IBASE+IJUMP - IJUMP=2*IJUMP+IINK -! - IF (JA.LT.JC) THEN - DO 350 K=LA,KSTOP,LA - KB=K+K - KC=KB+KB - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - C2=TRIGS(KC+1) - S2=TRIGS(KC+2) - JBASE=0 - DO 340 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 330 IJK=1,ILOT - A1=(C1*A(IB+I)+S1*B(IB+I))+(C2*A(IC+I)+S2*B(IC+I)) - B1=(C1*B(IB+I)-S1*A(IB+I))+(C2*B(IC+I)-S2*A(IC+I)) - A2=A(IA+I)-0.5_JPRB*A1 - B2=B(IA+I)-0.5_JPRB*B1 - A3=SIN60*((C1*A(IB+I)+S1*B(IB+I))-(C2*A(IC+I)+S2*B(IC+I))) - B3=SIN60*((C1*B(IB+I)-S1*A(IB+I))-(C2*B(IC+I)-S2*A(IC+I))) - C(JA+J)=A(IA+I)+A1 - D(JA+J)=B(IA+I)+B1 - C(JB+J)=A2+B3 - D(JB+J)=B2-A3 - C(JC+J)=A2-B3 - D(JC+J)=-(B2+A3) - I=I+INC3 - J=J+INC4 - 330 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 340 CONTINUE - IBASE=IBASE+IJUMP - JA=JA+JINK - JB=JB+JINK - JC=JC-JINK - 350 CONTINUE - ENDIF -! - IF (JA.EQ.JC) THEN - JBASE=0 - DO 380 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 370 IJK=1,ILOT - C(JA+J)=A(IA+I)+0.5_JPRB*(A(IB+I)-A(IC+I)) - D(JA+J)=-SIN60*(A(IB+I)+A(IC+I)) - C(JB+J)=A(IA+I)-(A(IB+I)-A(IC+I)) - I=I+INC3 - J=J+INC4 - 370 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 380 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - Z=1.0_JPRB/REAL(N,KIND=JPRB) - ZSIN60=Z*SIN60 - IF (LIPL) THEN - DO 394 L=1,ILA - I=IBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 392 IJK=1,ILOT - T1=Z*(A(IA+I)-0.5_JPRB*(A(IB+I)+A(IC+I))) - T2=ZSIN60*(A(IC+I)-A(IB+I)) - A(IA+I)=Z*(A(IA+I)+(A(IB+I)+A(IC+I))) - A(IB+I)=T1 - A(IC+I)=T2 - I=I+INC3 - 392 CONTINUE - IBASE=IBASE+INC11 - 394 CONTINUE - ELSE - DO 398 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 396 IJK=1,ILOT - C(JA+J)=Z*(A(IA+I)+(A(IB+I)+A(IC+I))) - C(JB+J)=Z*(A(IA+I)-0.5_JPRB*(A(IB+I)+A(IC+I))) - D(JB+J)=ZSIN60*(A(IC+I)-A(IB+I)) - I=I+INC3 - J=J+INC4 - 396 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 398 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.4) THEN -! -! CODING FOR FACTOR 4 -! ------------------- - 400 CONTINUE - IA=1 - IB=IA+IINK - IC=IB+IINK - ID=IC+IINK - JA=1 - JB=JA+(2*M-LA)*INC2 - JC=JB+2*M*INC2 - JD=JB -! - IF (LA.NE.M) THEN -! - DO 420 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 410 IJK=1,ILOT - C(JA+J)=(A(IA+I)+A(IC+I))+(A(IB+I)+A(ID+I)) - C(JC+J)=(A(IA+I)+A(IC+I))-(A(IB+I)+A(ID+I)) - C(JB+J)=A(IA+I)-A(IC+I) - D(JB+J)=A(ID+I)-A(IB+I) - I=I+INC3 - J=J+INC4 - 410 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 420 CONTINUE - JA=JA+JINK - JINK=2*JINK - JB=JB+JINK - JC=JC-JINK - JD=JD-JINK - IBASE=IBASE+IJUMP - IJUMP=2*IJUMP+IINK -! - IF (JB.LT.JC) THEN - DO 450 K=LA,KSTOP,LA - KB=K+K - KC=KB+KB - KD=KC+KB - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - C2=TRIGS(KC+1) - S2=TRIGS(KC+2) - C3=TRIGS(KD+1) - S3=TRIGS(KD+2) - JBASE=0 - DO 440 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 430 IJK=1,ILOT - A0=A(IA+I)+(C2*A(IC+I)+S2*B(IC+I)) - A2=A(IA+I)-(C2*A(IC+I)+S2*B(IC+I)) - A1=(C1*A(IB+I)+S1*B(IB+I))+(C3*A(ID+I)+S3*B(ID+I)) - A3=(C1*A(IB+I)+S1*B(IB+I))-(C3*A(ID+I)+S3*B(ID+I)) - B0=B(IA+I)+(C2*B(IC+I)-S2*A(IC+I)) - B2=B(IA+I)-(C2*B(IC+I)-S2*A(IC+I)) - B1=(C1*B(IB+I)-S1*A(IB+I))+(C3*B(ID+I)-S3*A(ID+I)) - B3=(C1*B(IB+I)-S1*A(IB+I))-(C3*B(ID+I)-S3*A(ID+I)) - C(JA+J)=A0+A1 - C(JC+J)=A0-A1 - D(JA+J)=B0+B1 - D(JC+J)=B1-B0 - C(JB+J)=A2+B3 - C(JD+J)=A2-B3 - D(JB+J)=B2-A3 - D(JD+J)=-(B2+A3) - I=I+INC3 - J=J+INC4 - 430 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 440 CONTINUE - IBASE=IBASE+IJUMP - JA=JA+JINK - JB=JB+JINK - JC=JC-JINK - JD=JD-JINK - 450 CONTINUE - ENDIF -! - IF (JB.EQ.JC) THEN - SIN45=SQRT(0.5_JPRB) - JBASE=0 - DO 480 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 470 IJK=1,ILOT - C(JA+J)=A(IA+I)+SIN45*(A(IB+I)-A(ID+I)) - C(JB+J)=A(IA+I)-SIN45*(A(IB+I)-A(ID+I)) - D(JA+J)=-A(IC+I)-SIN45*(A(IB+I)+A(ID+I)) - D(JB+J)=A(IC+I)-SIN45*(A(IB+I)+A(ID+I)) - I=I+INC3 - J=J+INC4 - 470 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 480 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - Z=1.0_JPRB/REAL(N,KIND=JPRB) - IF (LIPL) THEN - DO 494 L=1,ILA - I=IBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 492 IJK=1,ILOT - T1=Z*(A(IA+I)-A(IC+I)) - T3=Z*(A(ID+I)-A(IB+I)) - T2=Z*((A(IA+I)+A(IC+I))-(A(IB+I)+A(ID+I))) - A(IA+I)=Z*((A(IA+I)+A(IC+I))+(A(IB+I)+A(ID+I))) - A(IB+I)=T1 - A(IC+I)=T2 - A(ID+I)=T3 - I=I+INC3 - 492 CONTINUE - IBASE=IBASE+INC11 - 494 CONTINUE - ELSE - DO 498 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 496 IJK=1,ILOT - C(JA+J)=Z*((A(IA+I)+A(IC+I))+(A(IB+I)+A(ID+I))) - C(JC+J)=Z*((A(IA+I)+A(IC+I))-(A(IB+I)+A(ID+I))) - C(JB+J)=Z*(A(IA+I)-A(IC+I)) - D(JB+J)=Z*(A(ID+I)-A(IB+I)) - I=I+INC3 - J=J+INC4 - 496 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 498 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.5) THEN -! -! CODING FOR FACTOR 5 -! ------------------- - 500 CONTINUE - IA=1 - IB=IA+IINK - IC=IB+IINK - ID=IC+IINK - IE=ID+IINK - JA=1 - JB=JA+(2*M-LA)*INC2 - JC=JB+2*M*INC2 - JD=JC - JE=JB -! - IF (LA.NE.M) THEN -! - DO 520 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 510 IJK=1,ILOT - A1=A(IB+I)+A(IE+I) - A3=A(IB+I)-A(IE+I) - A2=A(IC+I)+A(ID+I) - A4=A(IC+I)-A(ID+I) - A5=A(IA+I)-0.25_JPRB*(A1+A2) - A6=QRT5*(A1-A2) - C(JA+J)=A(IA+I)+(A1+A2) - C(JB+J)=A5+A6 - C(JC+J)=A5-A6 - D(JB+J)=-SIN72*A3-SIN36*A4 - D(JC+J)=-SIN36*A3+SIN72*A4 - I=I+INC3 - J=J+INC4 - 510 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 520 CONTINUE - JA=JA+JINK - JINK=2*JINK - JB=JB+JINK - JC=JC+JINK - JD=JD-JINK - JE=JE-JINK - IBASE=IBASE+IJUMP - IJUMP=2*IJUMP+IINK -! - IF (JB.LT.JD) THEN - DO 550 K=LA,KSTOP,LA - KB=K+K - KC=KB+KB - KD=KC+KB - KE=KD+KB - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - C2=TRIGS(KC+1) - S2=TRIGS(KC+2) - C3=TRIGS(KD+1) - S3=TRIGS(KD+2) - C4=TRIGS(KE+1) - S4=TRIGS(KE+2) - JBASE=0 - DO 540 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 530 IJK=1,ILOT - A1=(C1*A(IB+I)+S1*B(IB+I))+(C4*A(IE+I)+S4*B(IE+I)) - A3=(C1*A(IB+I)+S1*B(IB+I))-(C4*A(IE+I)+S4*B(IE+I)) - A2=(C2*A(IC+I)+S2*B(IC+I))+(C3*A(ID+I)+S3*B(ID+I)) - A4=(C2*A(IC+I)+S2*B(IC+I))-(C3*A(ID+I)+S3*B(ID+I)) - B1=(C1*B(IB+I)-S1*A(IB+I))+(C4*B(IE+I)-S4*A(IE+I)) - B3=(C1*B(IB+I)-S1*A(IB+I))-(C4*B(IE+I)-S4*A(IE+I)) - B2=(C2*B(IC+I)-S2*A(IC+I))+(C3*B(ID+I)-S3*A(ID+I)) - B4=(C2*B(IC+I)-S2*A(IC+I))-(C3*B(ID+I)-S3*A(ID+I)) - A5=A(IA+I)-0.25_JPRB*(A1+A2) - A6=QRT5*(A1-A2) - B5=B(IA+I)-0.25_JPRB*(B1+B2) - B6=QRT5*(B1-B2) - A10=A5+A6 - A20=A5-A6 - B10=B5+B6 - B20=B5-B6 - A11=SIN72*B3+SIN36*B4 - A21=SIN36*B3-SIN72*B4 - B11=SIN72*A3+SIN36*A4 - B21=SIN36*A3-SIN72*A4 - C(JA+J)=A(IA+I)+(A1+A2) - C(JB+J)=A10+A11 - C(JE+J)=A10-A11 - C(JC+J)=A20+A21 - C(JD+J)=A20-A21 - D(JA+J)=B(IA+I)+(B1+B2) - D(JB+J)=B10-B11 - D(JE+J)=-(B10+B11) - D(JC+J)=B20-B21 - D(JD+J)=-(B20+B21) - I=I+INC3 - J=J+INC4 - 530 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 540 CONTINUE - IBASE=IBASE+IJUMP - JA=JA+JINK - JB=JB+JINK - JC=JC+JINK - JD=JD-JINK - JE=JE-JINK - 550 CONTINUE - ENDIF -! - IF (JB.EQ.JD) THEN - JBASE=0 - DO 580 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 570 IJK=1,ILOT - A1=A(IB+I)+A(IE+I) - A3=A(IB+I)-A(IE+I) - A2=A(IC+I)+A(ID+I) - A4=A(IC+I)-A(ID+I) - A5=A(IA+I)+0.25_JPRB*(A3-A4) - A6=QRT5*(A3+A4) - C(JA+J)=A5+A6 - C(JB+J)=A5-A6 - C(JC+J)=A(IA+I)-(A3-A4) - D(JA+J)=-SIN36*A1-SIN72*A2 - D(JB+J)=-SIN72*A1+SIN36*A2 - I=I+INC3 - J=J+INC4 - 570 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 580 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - Z=1.0_JPRB/REAL(N,KIND=JPRB) - ZQRT5=Z*QRT5 - ZSIN36=Z*SIN36 - ZSIN72=Z*SIN72 - IF (LIPL) THEN - DO 594 L=1,ILA - I=IBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 592 IJK=1,ILOT - A1=A(IB+I)+A(IE+I) - A3=A(IB+I)-A(IE+I) - A2=A(IC+I)+A(ID+I) - A4=A(IC+I)-A(ID+I) - A5=Z*(A(IA+I)-0.25_JPRB*(A1+A2)) - A6=ZQRT5*(A1-A2) - A(IA+I)=Z*(A(IA+I)+(A1+A2)) - A(IB+I)=A5+A6 - A(ID+I)=A5-A6 - A(IC+I)=-ZSIN72*A3-ZSIN36*A4 - A(IE+I)=-ZSIN36*A3+ZSIN72*A4 - I=I+INC3 - 592 CONTINUE - IBASE=IBASE+INC11 - 594 CONTINUE - ELSE - DO 598 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 596 IJK=1,ILOT - A1=A(IB+I)+A(IE+I) - A3=A(IB+I)-A(IE+I) - A2=A(IC+I)+A(ID+I) - A4=A(IC+I)-A(ID+I) - A5=Z*(A(IA+I)-0.25_JPRB*(A1+A2)) - A6=ZQRT5*(A1-A2) - C(JA+J)=Z*(A(IA+I)+(A1+A2)) - C(JB+J)=A5+A6 - C(JC+J)=A5-A6 - D(JB+J)=-ZSIN72*A3-ZSIN36*A4 - D(JC+J)=-ZSIN36*A3+ZSIN72*A4 - I=I+INC3 - J=J+INC4 - 596 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 598 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.6) THEN -! -! CODING FOR FACTOR 6 -! ------------------- - 600 CONTINUE - IA=1 - IB=IA+IINK - IC=IB+IINK - ID=IC+IINK - IE=ID+IINK - IF=IE+IINK - JA=1 - JB=JA+(2*M-LA)*INC2 - JC=JB+2*M*INC2 - JD=JC+2*M*INC2 - JE=JC - JF=JB -! - IF (LA.NE.M) THEN -! - DO 620 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 610 IJK=1,ILOT - A11=(A(IC+I)+A(IF+I))+(A(IB+I)+A(IE+I)) - C(JA+J)=(A(IA+I)+A(ID+I))+A11 - C(JC+J)=(A(IA+I)+A(ID+I)-0.5_JPRB*A11) - D(JC+J)=SIN60*((A(IC+I)+A(IF+I))-(A(IB+I)+A(IE+I))) - A11=(A(IC+I)-A(IF+I))+(A(IE+I)-A(IB+I)) - C(JB+J)=(A(IA+I)-A(ID+I))-0.5_JPRB*A11 - D(JB+J)=SIN60*((A(IE+I)-A(IB+I))-(A(IC+I)-A(IF+I))) - C(JD+J)=(A(IA+I)-A(ID+I))+A11 - I=I+INC3 - J=J+INC4 - 610 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 620 CONTINUE - JA=JA+JINK - JINK=2*JINK - JB=JB+JINK - JC=JC+JINK - JD=JD-JINK - JE=JE-JINK - JF=JF-JINK - IBASE=IBASE+IJUMP - IJUMP=2*IJUMP+IINK -! - IF (JC.LT.JD) THEN - DO 650 K=LA,KSTOP,LA - KB=K+K - KC=KB+KB - KD=KC+KB - KE=KD+KB - KF=KE+KB - C1=TRIGS(KB+1) - S1=TRIGS(KB+2) - C2=TRIGS(KC+1) - S2=TRIGS(KC+2) - C3=TRIGS(KD+1) - S3=TRIGS(KD+2) - C4=TRIGS(KE+1) - S4=TRIGS(KE+2) - C5=TRIGS(KF+1) - S5=TRIGS(KF+2) - JBASE=0 - DO 640 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 630 IJK=1,ILOT - A1=C1*A(IB+I)+S1*B(IB+I) - B1=C1*B(IB+I)-S1*A(IB+I) - A2=C2*A(IC+I)+S2*B(IC+I) - B2=C2*B(IC+I)-S2*A(IC+I) - A3=C3*A(ID+I)+S3*B(ID+I) - B3=C3*B(ID+I)-S3*A(ID+I) - A4=C4*A(IE+I)+S4*B(IE+I) - B4=C4*B(IE+I)-S4*A(IE+I) - A5=C5*A(IF+I)+S5*B(IF+I) - B5=C5*B(IF+I)-S5*A(IF+I) - A11=(A2+A5)+(A1+A4) - A20=(A(IA+I)+A3)-0.5_JPRB*A11 - A21=SIN60*((A2+A5)-(A1+A4)) - B11=(B2+B5)+(B1+B4) - B20=(B(IA+I)+B3)-0.5_JPRB*B11 - B21=SIN60*((B2+B5)-(B1+B4)) - C(JA+J)=(A(IA+I)+A3)+A11 - D(JA+J)=(B(IA+I)+B3)+B11 - C(JC+J)=A20-B21 - D(JC+J)=A21+B20 - C(JE+J)=A20+B21 - D(JE+J)=A21-B20 - A11=(A2-A5)+(A4-A1) - A20=(A(IA+I)-A3)-0.5_JPRB*A11 - A21=SIN60*((A4-A1)-(A2-A5)) - B11=(B5-B2)-(B4-B1) - B20=(B3-B(IA+I))-0.5_JPRB*B11 - B21=SIN60*((B5-B2)+(B4-B1)) - C(JB+J)=A20-B21 - D(JB+J)=A21-B20 - C(JD+J)=A11+(A(IA+I)-A3) - D(JD+J)=B11+(B3-B(IA+I)) - C(JF+J)=A20+B21 - D(JF+J)=A21+B20 - I=I+INC3 - J=J+INC4 - 630 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 640 CONTINUE - IBASE=IBASE+IJUMP - JA=JA+JINK - JB=JB+JINK - JC=JC+JINK - JD=JD-JINK - JE=JE-JINK - JF=JF-JINK - 650 CONTINUE - ENDIF -! - IF (JC.EQ.JD) THEN - JBASE=0 - DO 680 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 670 IJK=1,ILOT - C(JA+J)=(A(IA+I)+0.5_JPRB*(A(IC+I)-A(IE+I)))+ & - & SIN60*(A(IB+I)-A(IF+I)) - D(JA+J)=-(A(ID+I)+0.5_JPRB*(A(IB+I)+A(IF+I)))- & - & SIN60*(A(IC+I)+A(IE+I)) - C(JB+J)=A(IA+I)-(A(IC+I)-A(IE+I)) - D(JB+J)=A(ID+I)-(A(IB+I)+A(IF+I)) - C(JC+J)=(A(IA+I)+0.5_JPRB*(A(IC+I)-A(IE+I)))- & - & SIN60*(A(IB+I)-A(IF+I)) - D(JC+J)=-(A(ID+I)+0.5_JPRB*(A(IB+I)+ & - & A(IF+I)))+SIN60*(A(IC+I)+A(IE+I)) - I=I+INC3 - J=J+INC4 - 670 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 680 CONTINUE - ENDIF -! - ELSE !!! Case LA=M - Z=1.0_JPRB/REAL(N,KIND=JPRB) - ZSIN60=Z*SIN60 - IF (LIPL) THEN - DO 694 L=1,ILA - I=IBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 692 IJK=1,ILOT - A11=(A(IC+I)-A(IF+I))+(A(IE+I)-A(IB+I)) - T1=Z*((A(IA+I)-A(ID+I))-0.5_JPRB*A11) - T5=Z*((A(IA+I)-A(ID+I))+A11) - T2=ZSIN60*((A(IE+I)-A(IB+I))-(A(IC+I)-A(IF+I))) - T4=ZSIN60*((A(IC+I)+A(IF+I))-(A(IB+I)+A(IE+I))) - A11=(A(IC+I)+A(IF+I))+(A(IB+I)+A(IE+I)) - T3=Z*((A(IA+I)+A(ID+I))-0.5_JPRB*A11) - A(IA+I)=Z*((A(IA+I)+A(ID+I))+A11) - A(IB+I)=T1 - A(IC+I)=T2 - A(ID+I)=T3 - A(IE+I)=T4 - A(IF+I)=T5 - I=I+INC3 - 692 CONTINUE - IBASE=IBASE+INC11 - 694 CONTINUE - ELSE - DO 698 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 696 IJK=1,ILOT - A11=(A(IC+I)+A(IF+I))+(A(IB+I)+A(IE+I)) - C(JA+J)=Z*((A(IA+I)+A(ID+I))+A11) - C(JC+J)=Z*((A(IA+I)+A(ID+I))-0.5_JPRB*A11) - D(JC+J)=ZSIN60*((A(IC+I)+A(IF+I))-(A(IB+I)+A(IE+I))) - A11=(A(IC+I)-A(IF+I))+(A(IE+I)-A(IB+I)) - C(JB+J)=Z*((A(IA+I)-A(ID+I))-0.5_JPRB*A11) - D(JB+J)=ZSIN60*((A(IE+I)-A(IB+I))-(A(IC+I)-A(IF+I))) - C(JD+J)=Z*((A(IA+I)-A(ID+I))+A11) - I=I+INC3 - J=J+INC4 - 696 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 698 CONTINUE - ENDIF - ENDIF -! - ELSEIF (IFAC.EQ.8) THEN -! -! CODING FOR FACTOR 8 -! ------------------- - 800 CONTINUE - IF (LA.NE.M) THEN - IBAD=3 - ELSE - IA=1 - IB=IA+IINK - IC=IB+IINK - ID=IC+IINK - IE=ID+IINK - IF=IE+IINK - IG=IF+IINK - IH=IG+IINK - JA=1 - JB=JA+LA*INC2 - JC=JB+2*M*INC2 - JD=JC+2*M*INC2 - JE=JD+2*M*INC2 - Z=1.0_JPRB/REAL(N,KIND=JPRB) - ZSIN45=Z*SQRT(0.5_JPRB) -! - IF (LIPL) THEN - DO 820 L=1,ILA - I=IBASE -!OCL NOVREC -!DEC$ IVDEP -!NEC$ ivdep - DO 810 IJK=1,ILOT - T3=Z*((A(IA+I)+A(IE+I))-(A(IC+I)+A(IG+I))) - T4=Z*((A(ID+I)+A(IH+I))-(A(IB+I)+A(IF+I))) - T1=Z*(A(IA+I)-A(IE+I)) & - & +ZSIN45*((A(IH+I)-A(ID+I))-(A(IF+I)-A(IB+I))) - T5=Z*(A(IA+I)-A(IE+I)) & - & -ZSIN45*((A(IH+I)-A(ID+I))-(A(IF+I)-A(IB+I))) - T2=ZSIN45*((A(IH+I)-A(ID+I))+(A(IF+I)-A(IB+I))) & - & +Z*(A(IG+I)-A(IC+I)) - T6=ZSIN45*((A(IH+I)-A(ID+I))+(A(IF+I)-A(IB+I))) & - & -Z*(A(IG+I)-A(IC+I)) - T7=Z*(((A(IA+I)+A(IE+I))+(A(IC+I)+A(IG+I)))- & - & ((A(ID+I)+A(IH+I))+(A(IB+I)+A(IF+I)))) - A(IA+I)=Z*(((A(IA+I)+A(IE+I))+(A(IC+I)+A(IG+I)))+ & - & ((A(ID+I)+A(IH+I))+(A(IB+I)+A(IF+I)))) - A(IB+I)=T1 - A(IC+I)=T2 - A(ID+I)=T3 - A(IE+I)=T4 - A(IF+I)=T5 - A(IG+I)=T6 - A(IH+I)=T7 - I=I+INC3 - 810 CONTINUE - IBASE=IBASE+INC11 - 820 CONTINUE - ELSE - DO 840 L=1,ILA - I=IBASE - J=JBASE -!OCL NOVREC -!DEC$ IVDEP - DO 830 IJK=1,ILOT - C(JA+J)=Z*(((A(IA+I)+A(IE+I))+(A(IC+I)+A(IG+I)))+ & - & ((A(ID+I)+A(IH+I))+(A(IB+I)+A(IF+I)))) - C(JE+J)=Z*(((A(IA+I)+A(IE+I))+(A(IC+I)+A(IG+I)))- & - & ((A(ID+I)+A(IH+I))+(A(IB+I)+A(IF+I)))) - C(JC+J)=Z*((A(IA+I)+A(IE+I))-(A(IC+I)+A(IG+I))) - D(JC+J)=Z*((A(ID+I)+A(IH+I))-(A(IB+I)+A(IF+I))) - C(JB+J)=Z*(A(IA+I)-A(IE+I)) & - & +ZSIN45*((A(IH+I)-A(ID+I))-(A(IF+I)-A(IB+I))) - C(JD+J)=Z*(A(IA+I)-A(IE+I)) & - & -ZSIN45*((A(IH+I)-A(ID+I))-(A(IF+I)-A(IB+I))) - D(JB+J)=ZSIN45*((A(IH+I)-A(ID+I))+(A(IF+I)-A(IB+I))) & - & +Z*(A(IG+I)-A(IC+I)) - D(JD+J)=ZSIN45*((A(IH+I)-A(ID+I))+(A(IF+I)-A(IB+I))) & - & -Z*(A(IG+I)-A(IC+I)) - I=I+INC3 - J=J+INC4 - 830 CONTINUE - IBASE=IBASE+INC11 - JBASE=JBASE+INC2 - 840 CONTINUE - ENDIF -! - ENDIF -! - ELSE -! - IBAD=2 !!! Illegal factor -! - ENDIF -! -! RETURN -! ------ - 900 CONTINUE - IERR=IBAD - ENDSUBROUTINE QPASSF - - ENDSUBROUTINE FFT992 -#endif diff --git a/src/trans/algor/fft992_cc.F90 b/src/trans/algor/fft992_cc.F90 deleted file mode 100644 index 85d213b1..00000000 --- a/src/trans/algor/fft992_cc.F90 +++ /dev/null @@ -1,139 +0,0 @@ -! (C) Copyright 1998- ECMWF. -! (C) Copyright 2013- Meteo-France. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -SUBROUTINE FFT992_CC (A, KINC, KJUMP, KN, KLOT, KSIGN) -! -! Perform complex transforms with FFT992 like interface -! -! For KSIGN=-1 (Real -> Complex) call after of FFT992 -! For KSIGN=1 (Complex -> Real) call before FFT992 -! -USE PARKIND1, ONLY : JPIM, JPRB -IMPLICIT NONE -REAL(KIND=JPRB),INTENT(INOUT):: A(*) -INTEGER(KIND=JPIM),INTENT(IN):: KINC,KJUMP,KN,KLOT,KSIGN -REAL(KIND=JPRB),ALLOCATABLE :: ZWORK(:,:) -REAL(KIND=JPRB),ALLOCATABLE :: ZWORK1(:,:) -REAL(KIND=JPRB),ALLOCATABLE :: ZWORK2(:,:) -REAL(KIND=JPRB) :: ZN -INTEGER(KIND=JPIM) :: NH, NLOTH, I1, INCD, JLOT, I2, I1P, I2P, J, I1N, I2N - -NH=KN/2 -NLOTH=KLOT/2 -I1=1 -INCD=KINC*2 -ZN=SQRT(REAL(KN,JPRB)) - - -IF( KSIGN==-1)THEN - - IF( KJUMP /= 1 )THEN - ALLOCATE(ZWORK(1:2*KLOT,0:2*KN)) - DO JLOT=1,NLOTH - I2=I1+1 - I1P=I1+KINC - I2P=I2+KINC - DO J=0,NH - ZWORK(1,J)=A(I1+INCD*J)-A(I2P+INCD*J) - ZWORK(2,J)=A(I2+INCD*J)+A(I1P+INCD*J) - ZWORK(1,KN-J)=A(I1+INCD*J)+A(I2P+INCD*J) - ZWORK(2,KN-J)=A(I2+INCD*J)-A(I1P+INCD*J) - ENDDO - ! M normalization requires sqrt(M) ?? - DO J=0,KN-1 - A(I1+KINC*J)=ZWORK(1,J)*ZN - A(I2+KINC*J)=ZWORK(2,J)*ZN - ENDDO - I1=I1+KJUMP*2 - ENDDO - DEALLOCATE(ZWORK) - ELSE - ALLOCATE(ZWORK1(1:NLOTH,0:KN)) - ALLOCATE(ZWORK2(1:NLOTH,0:KN)) - DO J=0,NH - DO JLOT=1,NLOTH - ZWORK1(JLOT, J)=A((JLOT-1)*2+1+INCD*J)-A((JLOT-1)*2+2+KINC+INCD*J) - ZWORK2(JLOT, J)=A((JLOT-1)*2+2+INCD*J)+A((JLOT-1)*2+1+KINC+INCD*J) - ZWORK1(JLOT,KN-J)=A((JLOT-1)*2+1+INCD*J)+A((JLOT-1)*2+2+KINC+INCD*J) - ZWORK2(JLOT,KN-J)=A((JLOT-1)*2+2+INCD*J)-A((JLOT-1)*2+1+KINC+INCD*J) - ENDDO - ENDDO - DO J=0,KN-1 - DO JLOT=1,NLOTH - A((JLOT-1)*2+1+J*KINC)=ZWORK1(JLOT,J)*ZN - ENDDO - DO JLOT=1,NLOTH - A((JLOT-1)*2+2+J*KINC)=ZWORK2(JLOT,J)*ZN - ENDDO - ENDDO - DEALLOCATE(ZWORK1,ZWORK2) - ENDIF - -ELSE - - IF( KJUMP /= 1 )THEN - ALLOCATE(ZWORK(1:2*KLOT,0:2*KN)) - DO JLOT=1,NLOTH - I2=I1+1 - I1N=I1+KN*KINC - I2N=I2+KN*KINC - DO J=1,NH-1 - ZWORK(1,2*J)=0.5D0*(A(I1+KINC*J)+A(I1N-KINC*J)) - ZWORK(2,2*J)=0.5D0*(A(I2+KINC*J)+A(I2N-KINC*J)) - ZWORK(1,2*J+1)=0.5D0*(A(I2+KINC*J)-A(I2N-KINC*J)) - ZWORK(2,2*J+1)=0.5D0*(A(I1N-KINC*J)-A(I1+KINC*J)) - ENDDO - ZWORK(1,0)=A(I1) - ZWORK(2,0)=A(I2) - ZWORK(1,1)=0.0D0 - ZWORK(2,1)=0.0D0 - ZWORK(1,KN)=A(I1N) - ZWORK(2,KN)=A(I2N) - ZWORK(1,KN+1)=0.0D0 - ZWORK(2,KN+1)=0.0D0 - DO J=0,KN+1 - A(I1+KINC*J)=ZWORK(1,J) - A(I2+KINC*J)=ZWORK(2,J) - ENDDO - I1=I1+KJUMP*2 - ENDDO - DEALLOCATE(ZWORK) - ELSE - ALLOCATE(ZWORK1(1:NLOTH,0:KN+1)) - ALLOCATE(ZWORK2(1:NLOTH,0:KN+1)) - DO J=1,NH-1 - DO JLOT=1,NLOTH - ZWORK1(JLOT,2*J )=0.5D0*(A((JLOT-1)*2+1+KINC*J)+A((JLOT-1)*2+1+(KN-J)*KINC)) - ZWORK2(JLOT,2*J )=0.5D0*(A((JLOT-1)*2+2+KINC*J)+A((JLOT-1)*2+2+(KN-J)*KINC)) - ZWORK1(JLOT,2*J+1)=0.5D0*(A((JLOT-1)*2+2+KINC*J)-A((JLOT-1)*2+2+(KN-J)*KINC)) - ZWORK2(JLOT,2*J+1)=0.5D0*(A((JLOT-1)*2+1+(KN-J)*KINC)-A((JLOT-1)*2+1+KINC*J)) - ZWORK1(JLOT,0 )=A((JLOT-1)*2+1) - ZWORK2(JLOT,0 )=A((JLOT-1)*2+2) - ZWORK1(JLOT,1 )=0.0D0 - ZWORK2(JLOT,1 )=0.0D0 - ZWORK1(JLOT,KN )=A((JLOT-1)*2+1+KN*KINC) - ZWORK2(JLOT,KN )=A((JLOT-1)*2+2+KN*KINC) - ZWORK1(JLOT,KN+1 )=0.0D0 - ZWORK2(JLOT,KN+1 )=0.0D0 - ENDDO - ENDDO - DO J=0,KN+1 - DO JLOT=1,NLOTH - A((JLOT-1)*2+1+KINC*J)=ZWORK1(JLOT,J) - A((JLOT-1)*2+2+KINC*J)=ZWORK2(JLOT,J) - ENDDO - ENDDO - DEALLOCATE(ZWORK1,ZWORK2) - ENDIF - -ENDIF - -RETURN -END SUBROUTINE FFT992_CC diff --git a/src/trans/algor/set99.F90 b/src/trans/algor/set99.F90 deleted file mode 100644 index 8065a674..00000000 --- a/src/trans/algor/set99.F90 +++ /dev/null @@ -1,82 +0,0 @@ -! (C) Copyright 1998- ECMWF. -! (C) Copyright 2013- Meteo-France. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - - SUBROUTINE SET99(TRIGS,IFAX,N) -!AUTOPROMOTE - USE PARKIND1, ONLY : JPIM, JPRB - USE YOMHOOK , ONLY : LHOOK, DR_HOOK, JPHOOK -! - IMPLICIT NONE -! - INTEGER(KIND=JPIM) :: N - INTEGER(KIND=JPIM) :: I,IFAC,IL,IXXX,K,NHL,NIL,NFAX,NU - REAL(KIND=JPRB) :: ANGLE,DEL - REAL(KIND=JPRB) :: TRIGS(N) - INTEGER(KIND=JPIM) :: IFAX(*) - INTEGER(KIND=JPIM) :: JFAX(10),NLFAX(7) -! -! SUBROUTINE 'SET99' - COMPUTES FACTORS OF N & TRIGONOMETRIC -! FUNCTIONS REQUIRED BY FFT99 & FFT991 -! - SAVE NLFAX -! - DATA NLFAX/6,8,5,4,3,2,1/ -! - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - IF (LHOOK) CALL DR_HOOK('SET99',0,ZHOOK_HANDLE) - IXXX=1 -! - DEL=4.0_JPRB * ASIN(1.0_JPRB)/REAL(N,KIND=JPRB) - NIL=0 - NHL=(N/2)-1 - DO 10 K=NIL,NHL - ANGLE=REAL(K,KIND=JPRB)*DEL - TRIGS(2*K+1)=COS(ANGLE) - TRIGS(2*K+2)=SIN(ANGLE) - 10 CONTINUE -! -! FIND FACTORS OF N (8,6,5,4,3,2; ONLY ONE 8 ALLOWED) -! LOOK FOR SIXES FIRST, STORE FACTORS IN DESCENDING ORDER - NU=N - IFAC=6 - K=0 - IL=1 - 20 CONTINUE - IF (MOD(NU,IFAC).NE.0) GO TO 30 - K=K+1 - JFAX(K)=IFAC - IF (IFAC.NE.8) GO TO 25 - IF (K.EQ.1) GO TO 25 - JFAX(1)=8 - JFAX(K)=6 - 25 CONTINUE - NU=NU/IFAC - IF (NU.EQ.1) GO TO 50 - IF (IFAC.NE.8) GO TO 20 - 30 CONTINUE - IL=IL+1 - IFAC=NLFAX(IL) - IF (IFAC.GT.1) GO TO 20 -! - WRITE(6,40) N - 40 FORMAT(4H1N =,I4,27H - CONTAINS ILLEGAL FACTORS) - IF (LHOOK) CALL DR_HOOK('SET99',1,ZHOOK_HANDLE) - RETURN -! -! NOW REVERSE ORDER OF FACTORS - 50 CONTINUE - NFAX=K - IFAX(1)=NFAX - DO 60 I=1,NFAX - IFAX(NFAX+2-I)=JFAX(I) - 60 CONTINUE - IFAX(10)=N - IF (LHOOK) CALL DR_HOOK('SET99',1,ZHOOK_HANDLE) - ENDSUBROUTINE SET99 diff --git a/src/trans/algor/set99b.F90 b/src/trans/algor/set99b.F90 deleted file mode 100644 index 6aafca71..00000000 --- a/src/trans/algor/set99b.F90 +++ /dev/null @@ -1,81 +0,0 @@ -! (C) Copyright 1998- ECMWF. -! (C) Copyright 2013- Meteo-France. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - - SUBROUTINE SET99B(TRIGS,IFAX,N,LDUSEFFT992) -!AUTOPROMOTE - USE PARKIND1, ONLY : JPIM, JPRB -! - IMPLICIT NONE -! - INTEGER(KIND=JPIM),INTENT(IN) :: N - REAL(KIND=JPRB),INTENT(OUT) :: TRIGS(N) - INTEGER(KIND=JPIM),INTENT(OUT) :: IFAX(*) - LOGICAL,INTENT(OUT) :: LDUSEFFT992 - - INTEGER(KIND=JPIM) :: I,IFAC,IL,IXXX,K,NHL,NIL,NFAX,NU - REAL(KIND=JPRB) :: ANGLE,DEL - INTEGER(KIND=JPIM) :: JFAX(10),NLFAX(7) -! -! SUBROUTINE 'SET99B' - COMPUTES FACTORS OF N & TRIGONOMETRIC -! FUNCTIONS REQUIRED BY FFT992. -! BASED ON SET99, SET99B ALSO RETURNS VIA LUSEFFT992 WHETHER -! FACTORS HAVE BEEN FOUND THAT CAN PERMIT (OR NOT) FFT992 TO BE USED. -! - SAVE NLFAX -! - DATA NLFAX/6,8,5,4,3,2,1/ -! - IXXX=1 -! - DEL=4.0_JPRB * ASIN(1.0_JPRB)/REAL(N,KIND=JPRB) - NIL=0 - NHL=(N/2)-1 - DO 10 K=NIL,NHL - ANGLE=REAL(K,KIND=JPRB)*DEL - TRIGS(2*K+1)=COS(ANGLE) - TRIGS(2*K+2)=SIN(ANGLE) - 10 CONTINUE -! -! FIND FACTORS OF N (8,6,5,4,3,2; ONLY ONE 8 ALLOWED) -! LOOK FOR SIXES FIRST, STORE FACTORS IN DESCENDING ORDER - NU=N - IFAC=6 - K=0 - IL=1 - 20 CONTINUE - IF (MOD(NU,IFAC).NE.0) GO TO 30 - K=K+1 - JFAX(K)=IFAC - IF (IFAC.NE.8) GO TO 25 - IF (K.EQ.1) GO TO 25 - JFAX(1)=8 - JFAX(K)=6 - 25 CONTINUE - NU=NU/IFAC - IF (NU.EQ.1) GO TO 50 - IF (IFAC.NE.8) GO TO 20 - 30 CONTINUE - IL=IL+1 - IFAC=NLFAX(IL) - IF (IFAC.GT.1) GO TO 20 -! - LDUSEFFT992=.FALSE. - RETURN -! -! NOW REVERSE ORDER OF FACTORS - 50 CONTINUE - NFAX=K - IFAX(1)=NFAX - DO 60 I=1,NFAX - IFAX(NFAX+2-I)=JFAX(I) - 60 CONTINUE - IFAX(10)=N - LDUSEFFT992=.TRUE. - END SUBROUTINE SET99B diff --git a/src/trans/sedrenames.txt b/src/trans/sedrenames.txt index 63896567..66e63f27 100644 --- a/src/trans/sedrenames.txt +++ b/src/trans/sedrenames.txt @@ -2,7 +2,6 @@ s/ASRE1_MOD/ASRE1_MOD_VARIANTDESIGNATOR/g s/ASRE1AD_MOD/ASRE1AD_MOD_VARIANTDESIGNATOR/g s/ASRE1B_MOD/ASRE1B_MOD_VARIANTDESIGNATOR/g s/ASRE1BAD_MOD/ASRE1BAD_MOD_VARIANTDESIGNATOR/g -s/BLUESTEIN_MOD/BLUESTEIN_MOD_VARIANTDESIGNATOR/g s/BUTTERFLY_ALG_MOD/BUTTERFLY_ALG_MOD_VARIANTDESIGNATOR/g s/CDMAP_MOD/CDMAP_MOD_VARIANTDESIGNATOR/g s/DEALLOC_RESOL_MOD/DEALLOC_RESOL_MOD_VARIANTDESIGNATOR/g @@ -96,8 +95,6 @@ s/READ_LEGPOL_MOD/READ_LEGPOL_MOD_VARIANTDESIGNATOR/g s/seefmm_mix/seefmm_mix_VARIANTDESIGNATOR/g s/SEEFMM_MIX/SEEFMM_MIX_VARIANTDESIGNATOR/g s/SET_RESOL_MOD/SET_RESOL_MOD_VARIANTDESIGNATOR/g -s/SET99( *($|\(| |\*))/SET99_VARIANTDESIGNATOR\1/g -s/SET99B/SET99B_VARIANTDESIGNATOR/g s/SETUP_DIMS_MOD/SETUP_DIMS_MOD_VARIANTDESIGNATOR/g s/SETUP_GEOM_MOD/SETUP_GEOM_MOD_VARIANTDESIGNATOR/g s/SETUP_TRANS( *($|\(| |\*))/SETUP_TRANS_VARIANTDESIGNATOR\1/g From 057589d999d2a4fd7fb1ff7054f441150d873419 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 8 May 2024 09:47:06 +0200 Subject: [PATCH 28/48] Move IF statement out of Fourier latitude loop Also remove unnecessary duplicate of JGL. --- src/trans/internal/ftdir_ctl_mod.F90 | 29 +++++++++++++------------- src/trans/internal/ftdir_ctlad_mod.F90 | 27 ++++++++++++------------ 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/src/trans/internal/ftdir_ctl_mod.F90 b/src/trans/internal/ftdir_ctl_mod.F90 index 2f53e523..c102848f 100644 --- a/src/trans/internal/ftdir_ctl_mod.F90 +++ b/src/trans/internal/ftdir_ctl_mod.F90 @@ -89,7 +89,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & REAL(KIND=JPRB),TARGET, ALLOCATABLE :: ZGTF_HEAP(:,:) REAL(KIND=JPRB),POINTER, CONTIGUOUS :: ZGTF(:,:) -INTEGER(KIND=JPIM) :: IST,JGL,IGL,IBLEN +INTEGER(KIND=JPIM) :: IST,JGL,IBLEN INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) INTEGER(KIND=JPIM) :: IVSET(KF_GP) @@ -186,20 +186,21 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & IINC=-1 ENDIF -CALL GSTATS(1640,0) -!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,IGL) -DO JGL=IBEG,IEND,IINC - IGL = JGL - IF(KF_FS>0) THEN - CALL FTDIR(ZGTF,KF_FS,IGL) - ENDIF - -! Save Fourier data in FOUBUF_IN +CALL GSTATS(1640, 0) +! If this rank has any Fourier fields, Fourier transform them +IF (KF_FS > 0) THEN + !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL) + DO JGL = IBEG, IEND, IINC + ! Fourier transform + CALL FTDIR(ZGTF, KF_FS, JGL) + + ! Save Fourier data in FOUBUF_IN + CALL FOURIER_OUT(ZGTF, KF_FS, JGL) + ENDDO + !$OMP END PARALLEL DO +ENDIF +CALL GSTATS(1640, 1) - CALL FOURIER_OUT(ZGTF,KF_FS,IGL) -ENDDO -!$OMP END PARALLEL DO -CALL GSTATS(1640,1) CALL GSTATS(106,1) ! ------------------------------------------------------------------ diff --git a/src/trans/internal/ftdir_ctlad_mod.F90 b/src/trans/internal/ftdir_ctlad_mod.F90 index e748d787..0843530c 100644 --- a/src/trans/internal/ftdir_ctlad_mod.F90 +++ b/src/trans/internal/ftdir_ctlad_mod.F90 @@ -123,20 +123,21 @@ SUBROUTINE FTDIR_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & IINC=-1 ENDIF -CALL GSTATS(1642,0) -!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,IGL) -DO JGL=IBEG,IEND,IINC - IGL = JGL - CALL FOURIER_OUTAD(ZGTF,KF_FS,IGL) - -! Fourier transform +CALL GSTATS(1642, 0) +! If this rank has any Fourier fields, Fourier transform them +IF (KF_FS > 0) THEN + !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL) + DO JGL = IBEG, IEND, IINC + ! Copy out Fourier data from FOUBUF_IN + CALL FOURIER_OUTAD(ZGTF, KF_FS, JGL) + + ! Fourier transform + CALL FTDIRAD(ZGTF, KF_FS, JGL) + ENDDO + !$OMP END PARALLEL DO +ENDIF +CALL GSTATS(1642, 1) - IF(KF_FS>0) THEN - CALL FTDIRAD(ZGTF,KF_FS,IGL) - ENDIF -ENDDO -!$OMP END PARALLEL DO -CALL GSTATS(1642,1) CALL GSTATS(133,1) ! Transposition From 01b7dadd9220cb88818a6567143358ce60fe6849 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 8 May 2024 10:20:58 +0200 Subject: [PATCH 29/48] Remove north-south/south-north mirrored latitude looping Apparently this is unnecessary, so we remove it, and all ranks will go from north-south. --- src/trans/internal/ftdir_ctl_mod.F90 | 14 ++------------ src/trans/internal/ftdir_ctlad_mod.F90 | 14 ++------------ src/trans/internal/ftinv_ctl_mod.F90 | 14 ++------------ src/trans/internal/ftinv_ctlad_mod.F90 | 14 ++------------ 4 files changed, 8 insertions(+), 48 deletions(-) diff --git a/src/trans/internal/ftdir_ctl_mod.F90 b/src/trans/internal/ftdir_ctl_mod.F90 index c102848f..a256f77b 100644 --- a/src/trans/internal/ftdir_ctl_mod.F90 +++ b/src/trans/internal/ftdir_ctl_mod.F90 @@ -94,7 +94,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) INTEGER(KIND=JPIM) :: IVSET(KF_GP) INTEGER(KIND=JPIM) :: IFGP2,IFGP3A,IFGP3B,IOFF,J3 -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC ! ------------------------------------------------------------------ @@ -176,21 +175,12 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & ALLOCATE(FOUBUF_IN(MAX(1,IBLEN))) ENDIF -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - CALL GSTATS(1640, 0) ! If this rank has any Fourier fields, Fourier transform them IF (KF_FS > 0) THEN + ! Loop over latitudes !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL) - DO JGL = IBEG, IEND, IINC + DO JGL = 1, D%NDGL_FS ! Fourier transform CALL FTDIR(ZGTF, KF_FS, JGL) diff --git a/src/trans/internal/ftdir_ctlad_mod.F90 b/src/trans/internal/ftdir_ctlad_mod.F90 index 0843530c..29f4fa01 100644 --- a/src/trans/internal/ftdir_ctlad_mod.F90 +++ b/src/trans/internal/ftdir_ctlad_mod.F90 @@ -93,7 +93,6 @@ SUBROUTINE FTDIR_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & INTEGER(KIND=JPIM) :: IVSET(KF_GP) INTEGER(KIND=JPIM) :: JGL,IGL INTEGER(KIND=JPIM) :: IFGP2,IFGP3A,IFGP3B,IOFF,J3 -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC ! ------------------------------------------------------------------ @@ -113,21 +112,12 @@ SUBROUTINE FTDIR_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & ZGTF => ZGTF_HEAP(:,:) ENDIF -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - CALL GSTATS(1642, 0) ! If this rank has any Fourier fields, Fourier transform them IF (KF_FS > 0) THEN + ! Loop over latitudes !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL) - DO JGL = IBEG, IEND, IINC + DO JGL = 1, D%NDGL_FS ! Copy out Fourier data from FOUBUF_IN CALL FOURIER_OUTAD(ZGTF, KF_FS, JGL) diff --git a/src/trans/internal/ftinv_ctl_mod.F90 b/src/trans/internal/ftinv_ctl_mod.F90 index c438f8a6..b34aead9 100644 --- a/src/trans/internal/ftinv_ctl_mod.F90 +++ b/src/trans/internal/ftinv_ctl_mod.F90 @@ -101,7 +101,6 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) INTEGER(KIND=JPIM) :: IVSET(KF_GP) INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRB),POINTER :: ZUV(:,:) REAL(KIND=JPRB),POINTER :: ZSCALAR(:,:) @@ -169,19 +168,10 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& ENDIF ENDIF -IF (MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - CALL GSTATS(1639,0) +! Loop over latitudes !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,IGL) -DO JGL=IBEG,IEND,IINC +DO JGL = 1, D%NDGL_FS IGL = JGL CALL FOURIER_IN(ZGTF,KF_OUT_LT,IGL) diff --git a/src/trans/internal/ftinv_ctlad_mod.F90 b/src/trans/internal/ftinv_ctlad_mod.F90 index b79caddd..c7f0508a 100644 --- a/src/trans/internal/ftinv_ctlad_mod.F90 +++ b/src/trans/internal/ftinv_ctlad_mod.F90 @@ -114,7 +114,6 @@ SUBROUTINE FTINV_CTLAD(KF_UV_G,KF_SCALARS_G,& INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) INTEGER(KIND=JPIM) :: IVSET(KF_GP) INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC ! ------------------------------------------------------------------ @@ -266,19 +265,10 @@ SUBROUTINE FTINV_CTLAD(KF_UV_G,KF_SCALARS_G,& CALL GSTATS(132,0) -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - CALL GSTATS(1641,0) +! Loop over latitudes !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,IGL) -DO JGL=IBEG,IEND,IINC +DO JGL = 1, D%NDGL_FS IGL = JGL IF(KF_FS > 0) THEN CALL FTINVAD(ZGTF,KF_FS,IGL) From fa389c7a7cf86ed03d1a96f7c08a2a9287129d64 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Fri, 10 May 2024 10:35:02 +0100 Subject: [PATCH 30/48] Remove redundant variable IF_GPB --- src/trans/internal/dir_trans_ctl_mod.F90 | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/trans/internal/dir_trans_ctl_mod.F90 b/src/trans/internal/dir_trans_ctl_mod.F90 index ac2d6665..6c5dd62f 100644 --- a/src/trans/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/internal/dir_trans_ctl_mod.F90 @@ -118,22 +118,21 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM) :: IVSETUV(KF_GP),IVSETSC(KF_GP) INTEGER(KIND=JPIM) :: IBLKS,JBLK,ISTUV_G,IENUV_G INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP -INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB +INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV ! ------------------------------------------------------------------ ! Perform transform -IF_GPB = 2*KF_UV_G+KF_SCALARS_G -IF(NPROMATR > 0 .AND. IF_GPB > NPROMATR) THEN +IF(NPROMATR > 0 .AND. KF_GP > NPROMATR) THEN ! Fields to be split into packets CALL SHUFFLE(KF_UV_G,KF_SCALARS_G,ISHFUV_G,IVSETUV,ISHFSC_G,IVSETSC,& & KVSETUV,KVSETSC) - IBLKS=(IF_GPB-1)/NPROMATR+1 + IBLKS=(KF_GP-1)/NPROMATR+1 DO JBLK=1,IBLKS From 0beada720b1d91a9b7f4c082e36da91e8debb19e Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 14 May 2024 10:03:41 +0000 Subject: [PATCH 31/48] Tidy up and comment FOURIER_IN/OUT subroutines --- src/trans/internal/fourier_in_mod.F90 | 48 ++++++++++++++---------- src/trans/internal/fourier_inad_mod.F90 | 48 ++++++++++++++---------- src/trans/internal/fourier_out_mod.F90 | 47 +++++++++++++---------- src/trans/internal/fourier_outad_mod.F90 | 47 +++++++++++++---------- 4 files changed, 112 insertions(+), 78 deletions(-) diff --git a/src/trans/internal/fourier_in_mod.F90 b/src/trans/internal/fourier_in_mod.F90 index 67f936aa..58330974 100644 --- a/src/trans/internal/fourier_in_mod.F90 +++ b/src/trans/internal/fourier_in_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_IN_MOD CONTAINS -SUBROUTINE FOURIER_IN(PREEL,KFIELDS,KGL) +SUBROUTINE FOURIER_IN(PREEL, KFIELDS, KGL) !**** *FOURIER_IN* - Copy fourier data from buffer to local array @@ -24,6 +24,7 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS,KGL) ! Explicit arguments : PREEL - local fourier/GP array ! -------------------- KFIELDS - number of fields +! KGL - local index of latitude we are currently on ! ! Externals. None. ! ---------- @@ -38,37 +39,44 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS,KGL) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_TRANS ,ONLY : FOUBUF -USE TPM_GEOMETRY ,ONLY : G -! +USE PARKIND1, ONLY : JPIM, JPRB +USE TPM_DISTR, ONLY : D, MYSETW +USE TPM_TRANS, ONLY : FOUBUF +USE TPM_GEOMETRY, ONLY : G IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL +REAL(KIND=JPRB), INTENT(OUT) :: PREEL(:,:) +INTEGER(KIND=JPIM), INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM), INTENT(IN) :: KGL -REAL(KIND=JPRB), INTENT(OUT) :: PREEL(:,:) - -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA +INTEGER(KIND=JPIM) :: JM, JF, IGLG, IPROC, IR, II, ISTA ! ------------------------------------------------------------------ -IGLG = D%NPTRLS(MYSETW)+KGL-1 -DO JM=0,G%NMEN(IGLG) +! Determine global latitude index corresponding to local latitude index KGL +IGLG = D%NPTRLS(MYSETW) + KGL - 1 + +! Loop over all zonal wavenumbers relevant for this latitude +DO JM = 0, G%NMEN(IGLG) + ! Get the member of the W-set responsible for this zonal wavenumber in the "m" representation IPROC = D%NPROCM(JM) - IR = 2*JM+1+D%NSTAGTF(KGL) - II = 2*JM+2+D%NSTAGTF(KGL) - ISTA = (D%NSTAGT0B(D%MSTABF(IPROC))+D%NPNTGTB0(JM,KGL))*2*KFIELDS - DO JF=1,KFIELDS + + ! Compute offset in FFT work array PREEL corresponding to wavenumber JM and latitude KGL + IR = 2 * JM + 1 + D%NSTAGTF(KGL) + II = 2 * JM + 2 + D%NSTAGTF(KGL) + + ! Compute offset for extraction of the fields from the m-to-l transposition buffer, FOUBUF + ISTA = (D%NSTAGT0B(D%MSTABF(IPROC)) + D%NPNTGTB0(JM,KGL)) * 2 * KFIELDS + + ! Copy all fields from m-to-l transposition buffer to FFT work array + DO JF = 1, KFIELDS PREEL(JF,IR) = FOUBUF(ISTA+2*JF-1) - PREEL(JF,II) = FOUBUF(ISTA+2*JF ) + PREEL(JF,II) = FOUBUF(ISTA+2*JF) ENDDO ENDDO ! ------------------------------------------------------------------ END SUBROUTINE FOURIER_IN -END MODULE FOURIER_IN_MOD - +END MODULE FOURIER_IN_MOD \ No newline at end of file diff --git a/src/trans/internal/fourier_inad_mod.F90 b/src/trans/internal/fourier_inad_mod.F90 index 982e043f..6a6b982a 100644 --- a/src/trans/internal/fourier_inad_mod.F90 +++ b/src/trans/internal/fourier_inad_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_INAD_MOD CONTAINS -SUBROUTINE FOURIER_INAD(PREEL,KFIELDS,KGL) +SUBROUTINE FOURIER_INAD(PREEL, KFIELDS, KGL) !**** *FOURIER_INAD* - Copy fourier data from buffer to local array - adjoint @@ -24,6 +24,7 @@ SUBROUTINE FOURIER_INAD(PREEL,KFIELDS,KGL) ! Explicit arguments : PREEL - local fourier/GP array ! -------------------- KFIELDS - number of fields +! KGL - local index of latitude we are currently on ! ! Externals. None. ! ---------- @@ -38,37 +39,44 @@ SUBROUTINE FOURIER_INAD(PREEL,KFIELDS,KGL) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_TRANS ,ONLY : FOUBUF -USE TPM_GEOMETRY ,ONLY : G -! +USE PARKIND1, ONLY : JPIM, JPRB +USE TPM_DISTR, ONLY : D, MYSETW +USE TPM_TRANS, ONLY : FOUBUF +USE TPM_GEOMETRY, ONLY : G IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL +REAL(KIND=JPRB), INTENT(IN) :: PREEL(:,:) +INTEGER(KIND=JPIM), INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM), INTENT(IN) :: KGL -REAL(KIND=JPRB), INTENT(IN) :: PREEL(:,:) - -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA +INTEGER(KIND=JPIM) :: JM, JF, IGLG, IPROC, IR, II, ISTA ! ------------------------------------------------------------------ -IGLG = D%NPTRLS(MYSETW)+KGL-1 -DO JM=0,G%NMEN(IGLG) +! Determine global latitude index corresponding to local latitude index KGL +IGLG = D%NPTRLS(MYSETW) + KGL - 1 + +! Loop over all zonal wavenumbers relevant for this latitude +DO JM = 0, G%NMEN(IGLG) + ! Get the member of the W-set responsible for this zonal wavenumber in the "m" representation IPROC = D%NPROCM(JM) - IR = 2*JM+1+D%NSTAGTF(KGL) - II = 2*JM+2+D%NSTAGTF(KGL) - ISTA = (D%NSTAGT0B(D%MSTABF(IPROC))+D%NPNTGTB0(JM,KGL))*2*KFIELDS - DO JF=1,KFIELDS + + ! Compute offset in FFT work array PREEL corresponding to wavenumber JM and latitude KGL + IR = 2 * JM + 1 + D%NSTAGTF(KGL) + II = 2 * JM + 2 + D%NSTAGTF(KGL) + + ! Compute offset for insertion of the fields in the m-to-l transposition buffer, FOUBUF + ISTA = (D%NSTAGT0B(D%MSTABF(IPROC)) + D%NPNTGTB0(JM,KGL)) * 2 * KFIELDS + + ! Copy all fields from FFT work array to m-to-l transposition buffer + DO JF = 1, KFIELDS FOUBUF(ISTA+2*JF-1) = PREEL(JF,IR) - FOUBUF(ISTA+2*JF ) = PREEL(JF,II) + FOUBUF(ISTA+2*JF) = PREEL(JF,II) ENDDO ENDDO ! ------------------------------------------------------------------ END SUBROUTINE FOURIER_INAD -END MODULE FOURIER_INAD_MOD - +END MODULE FOURIER_INAD_MOD \ No newline at end of file diff --git a/src/trans/internal/fourier_out_mod.F90 b/src/trans/internal/fourier_out_mod.F90 index 9e80bcf5..c2d16f79 100644 --- a/src/trans/internal/fourier_out_mod.F90 +++ b/src/trans/internal/fourier_out_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_OUT_MOD CONTAINS -SUBROUTINE FOURIER_OUT(PREEL,KFIELDS,KGL) +SUBROUTINE FOURIER_OUT(PREEL, KFIELDS, KGL) !**** *FOURIER_OUT* - Copy fourier data from local array to buffer @@ -24,6 +24,7 @@ SUBROUTINE FOURIER_OUT(PREEL,KFIELDS,KGL) ! Explicit arguments : PREEL - local fourier/GP array ! -------------------- KFIELDS - number of fields +! KGL - local index of latitude we are currently on ! ! Externals. None. ! ---------- @@ -38,36 +39,44 @@ SUBROUTINE FOURIER_OUT(PREEL,KFIELDS,KGL) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_TRANS ,ONLY : FOUBUF_IN -USE TPM_GEOMETRY ,ONLY : G -! +USE PARKIND1, ONLY : JPIM, JPRB +USE TPM_DISTR, ONLY : D, MYSETW +USE TPM_TRANS, ONLY : FOUBUF_IN +USE TPM_GEOMETRY, ONLY : G IMPLICIT NONE -REAL(KIND=JPRB), INTENT(IN) :: PREEL(:,:) -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL +REAL(KIND=JPRB), INTENT(IN) :: PREEL(:,:) +INTEGER(KIND=JPIM), INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM), INTENT(IN) :: KGL -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA +INTEGER(KIND=JPIM) :: JM, JF, IGLG, IPROC, IR, II, ISTA ! ------------------------------------------------------------------ -IGLG = D%NPTRLS(MYSETW)+KGL-1 -DO JM=0,G%NMEN(IGLG) +! Determine global latitude index corresponding to local latitude index KGL +IGLG = D%NPTRLS(MYSETW) + KGL - 1 + +! Loop over all zonal wavenumbers relevant for this latitude +DO JM = 0, G%NMEN(IGLG) + ! Get the member of the W-set responsible for this zonal wavenumber in the "m" representation IPROC = D%NPROCM(JM) - IR = 2*JM+1+D%NSTAGTF(KGL) - II = 2*JM+2+D%NSTAGTF(KGL) - ISTA = (D%NSTAGT1B(D%MSTABF(IPROC))+D%NPNTGTB0(JM,KGL))*2*KFIELDS - DO JF=1,KFIELDS + + ! Compute offset in FFT work array PREEL corresponding to wavenumber JM and latitude KGL + IR = 2 * JM + 1 + D%NSTAGTF(KGL) + II = 2 * JM + 2 + D%NSTAGTF(KGL) + + ! Compute offset for insertion of the fields in the l-to-m transposition buffer, FOUBUF_IN + ISTA = (D%NSTAGT1B(D%MSTABF(IPROC)) + D%NPNTGTB0(JM,KGL)) * 2 * KFIELDS + + ! Copy all fields from FFT work array to l-to-m transposition buffer + DO JF = 1, KFIELDS FOUBUF_IN(ISTA+2*JF-1) = PREEL(JF,IR) - FOUBUF_IN(ISTA+2*JF ) = PREEL(JF,II) + FOUBUF_IN(ISTA+2*JF) = PREEL(JF,II) ENDDO ENDDO ! ------------------------------------------------------------------ END SUBROUTINE FOURIER_OUT -END MODULE FOURIER_OUT_MOD - +END MODULE FOURIER_OUT_MOD \ No newline at end of file diff --git a/src/trans/internal/fourier_outad_mod.F90 b/src/trans/internal/fourier_outad_mod.F90 index 884c3dd2..b135ba2f 100644 --- a/src/trans/internal/fourier_outad_mod.F90 +++ b/src/trans/internal/fourier_outad_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_OUTAD_MOD CONTAINS -SUBROUTINE FOURIER_OUTAD(PREEL,KFIELDS,KGL) +SUBROUTINE FOURIER_OUTAD(PREEL, KFIELDS, KGL) !**** *FOURIER_OUTAD* - Copy fourier data from local array to buffer - adjoint @@ -24,6 +24,7 @@ SUBROUTINE FOURIER_OUTAD(PREEL,KFIELDS,KGL) ! Explicit arguments : PREEL - local fourier/GP array ! -------------------- KFIELDS - number of fields +! KGL - local index of latitude we are currently on ! ! Externals. None. ! ---------- @@ -38,36 +39,44 @@ SUBROUTINE FOURIER_OUTAD(PREEL,KFIELDS,KGL) ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_TRANS ,ONLY : FOUBUF_IN -USE TPM_GEOMETRY ,ONLY : G -! +USE PARKIND1, ONLY : JPIM, JPRB +USE TPM_DISTR, ONLY : D, MYSETW +USE TPM_TRANS, ONLY : FOUBUF_IN +USE TPM_GEOMETRY, ONLY : G IMPLICIT NONE -REAL(KIND=JPRB), INTENT(OUT) :: PREEL(:,:) -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL +REAL(KIND=JPRB), INTENT(OUT) :: PREEL(:,:) +INTEGER(KIND=JPIM), INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM), INTENT(IN) :: KGL -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA +INTEGER(KIND=JPIM) :: JM, JF, IGLG, IPROC, IR, II, ISTA ! ------------------------------------------------------------------ -IGLG = D%NPTRLS(MYSETW)+KGL-1 -DO JM=0,G%NMEN(IGLG) +! Determine global latitude index corresponding to local latitude index KGL +IGLG = D%NPTRLS(MYSETW) + KGL - 1 + +! Loop over all zonal wavenumbers relevant for this latitude +DO JM = 0, G%NMEN(IGLG) + ! Get the member of the W-set responsible for this zonal wavenumber in the "m" representation IPROC = D%NPROCM(JM) - IR = 2*JM+1+D%NSTAGTF(KGL) - II = 2*JM+2+D%NSTAGTF(KGL) - ISTA = (D%NSTAGT1B(D%MSTABF(IPROC))+D%NPNTGTB0(JM,KGL))*2*KFIELDS - DO JF=1,KFIELDS + + ! Compute offset in FFT work array PREEL corresponding to wavenumber JM and latitude KGL + IR = 2 * JM + 1 + D%NSTAGTF(KGL) + II = 2 * JM + 2 + D%NSTAGTF(KGL) + + ! Compute offset for extraction of the fields from the l-to-m transposition buffer, FOUBUF, IN + ISTA = (D%NSTAGT1B(D%MSTABF(IPROC)) + D%NPNTGTB0(JM,KGL)) * 2 * KFIELDS + + ! Copy all fields from l-to-m transposition buffer to FFT work array + DO JF = 1, KFIELDS PREEL(JF,IR) = FOUBUF_IN(ISTA+2*JF-1) - PREEL(JF,II) = FOUBUF_IN(ISTA+2*JF ) + PREEL(JF,II) = FOUBUF_IN(ISTA+2*JF) ENDDO ENDDO ! ------------------------------------------------------------------ END SUBROUTINE FOURIER_OUTAD -END MODULE FOURIER_OUTAD_MOD - +END MODULE FOURIER_OUTAD_MOD \ No newline at end of file From 503fdc1f5200eae2f840e9ad15a4407106799f1e Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 15 May 2024 09:44:17 +0000 Subject: [PATCH 32/48] Delete unusued variable NLENGT1B --- src/trans/internal/sump_trans_mod.F90 | 1 - src/trans/internal/tpm_distr.F90 | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/trans/internal/sump_trans_mod.F90 b/src/trans/internal/sump_trans_mod.F90 index 7526556c..af46e0d1 100644 --- a/src/trans/internal/sump_trans_mod.F90 +++ b/src/trans/internal/sump_trans_mod.F90 @@ -159,7 +159,6 @@ SUBROUTINE SUMP_TRANS D%NSTAGT1B(JA) = (JA-1)*IAUX1 ENDDO D%NLENGT0B = IAUX0*NPRTRNS - D%NLENGT1B = IAUX1*NPRTRNS ENDIF ! GRIDPOINT SPACE diff --git a/src/trans/internal/tpm_distr.F90 b/src/trans/internal/tpm_distr.F90 index d9640baa..ad609979 100644 --- a/src/trans/internal/tpm_distr.F90 +++ b/src/trans/internal/tpm_distr.F90 @@ -107,7 +107,7 @@ MODULE TPM_DISTR INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCL(:) ! Process responsible for each lat. (F.S) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPTRLS(:) ! Pointer to first lat. (F.S) -! NSTAGT0B to NLENGT1B: help arrays for spectral to fourier space transposition +! NSTAGT0B to NLENGT0B: help arrays for spectral to fourier space transposition INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGT0B(:) ! Start adresses for segments within buffer ! (according to processors to whom data ! is going to be sent) @@ -118,7 +118,6 @@ MODULE TPM_DISTR INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLTSGTB(:) INTEGER(KIND=JPIM) ,ALLOCATABLE :: MSTABF(:) INTEGER(KIND=JPIM) :: NLENGT0B ! dimension -INTEGER(KIND=JPIM) :: NLENGT1B ! dimension ! GRIDPOINT SPACE From 85321325d32bd442b8349ded1d083bb26467115f Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 14 May 2024 15:34:30 +0000 Subject: [PATCH 33/48] Add missing comments to describe indexing arrays in TPM_DISTR --- src/trans/internal/tpm_distr.F90 | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/trans/internal/tpm_distr.F90 b/src/trans/internal/tpm_distr.F90 index ad609979..8ada1a93 100644 --- a/src/trans/internal/tpm_distr.F90 +++ b/src/trans/internal/tpm_distr.F90 @@ -108,16 +108,28 @@ MODULE TPM_DISTR INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPTRLS(:) ! Pointer to first lat. (F.S) ! NSTAGT0B to NLENGT0B: help arrays for spectral to fourier space transposition -INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGT0B(:) ! Start adresses for segments within buffer - ! (according to processors to whom data - ! is going to be sent) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGT1B(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPNTGTB0(:,:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPNTGTB1(:,:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLTSFTB(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLTSGTB(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: MSTABF(:) -INTEGER(KIND=JPIM) :: NLENGT0B ! dimension + +! For index I, offset from which to take data from send buffer of TRMTOL to be sent to processor I +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGT0B(:) ! (1:NPRTRW+1) +! For index I, offset at which to put data in receive buffer of TRLTOM for sending processor I +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGT1B(:) ! (1:NPRTRW+1) +! For wavenumber JM (first dimension) and latitude KGL (second dimension), this gives the offset +! into the TRLTOM/TRMTOL send/receive buffers (FOUBUF, FOUBUF_IN) for JM and KGL, starting from the +! offset for the processor (i.e. this must be used in combination with NSTAGT0B) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPNTGTB0(:,:) ! (0:R%NSMAX,D%NDGL_FS) + +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPNTGTB1(:,:) ! (D%NUMP,R%NDGL) +! For index I, this tells you how many values will be transferred from this processor to processor I +! in TRMTOL and from processor I to this processor in TRLTOM +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLTSFTB(:) ! (1:NPRTRW+1) +! For index I, this tells you how many values will be transferred from this processor to processor I +! in TRLTOM and from processor I to this processor in TRMTOL +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLTSGTB(:) ! (1:NPRTRW+1) +! For index I, this tells you from where in the TRLTOM send buffer to take the data to send to +! processor I +INTEGER(KIND=JPIM) ,ALLOCATABLE :: MSTABF(:) ! (1:NPRTRW+1) +! Size of FOUBUF_IN, FOUBUF, except for the fields (i.e. this will be multiplied by 2 * KFIELD) +INTEGER(KIND=JPIM) :: NLENGT0B ! GRIDPOINT SPACE From 6f6ad0742236c46881881987cb6318574b840e51 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 20 May 2024 15:30:31 +0000 Subject: [PATCH 34/48] Remove redundant IMPLICIT NONEs The parent scope (BUTTERFLY_ALG_MOD) has this already. --- src/trans/algor/butterfly_alg_mod.F90 | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/trans/algor/butterfly_alg_mod.F90 b/src/trans/algor/butterfly_alg_mod.F90 index 364bd411..f7b5584e 100644 --- a/src/trans/algor/butterfly_alg_mod.F90 +++ b/src/trans/algor/butterfly_alg_mod.F90 @@ -68,7 +68,6 @@ MODULE BUTTERFLY_ALG_MOD CONTAINS !================================================================================ SUBROUTINE CONSTRUCT_BUTTERFLY(PEPS,KCMAX,KM,KN,PMAT,YD_STRUCT) -IMPLICIT NONE ! Constuct butterfly @@ -264,7 +263,7 @@ SUBROUTINE CONSTRUCT_BUTTERFLY(PEPS,KCMAX,KM,KN,PMAT,YD_STRUCT) END SUBROUTINE CONSTRUCT_BUTTERFLY !============================================================================= SUBROUTINE PACK_BUTTERFLY_STRUCT(YD_STRUCT,YD_CLONE) -IMPLICIT NONE + ! Pack butterfly struct into array TYPE(BUTTERFLY_STRUCT),INTENT(IN) :: YD_STRUCT ! Structure needed to apply butterfly TYPE(CLONE), TARGET, INTENT(OUT) :: YD_CLONE ! for communicating packed bufferfly_structs @@ -362,7 +361,7 @@ SUBROUTINE PACK_BUTTERFLY_STRUCT(YD_STRUCT,YD_CLONE) END SUBROUTINE PACK_BUTTERFLY_STRUCT !===================================================================================== SUBROUTINE UNPACK_BUTTERFLY_STRUCT(YD_STRUCT,YD_CLONE,YDMEMBUF) -IMPLICIT NONE + ! Construct butterfly struct from packed array TYPE(BUTTERFLY_STRUCT),INTENT(OUT) :: YD_STRUCT ! Structure needed to apply butterfly TYPE(CLONE), TARGET, OPTIONAL,INTENT(IN) :: YD_CLONE ! for communicating packed bufferfly_structs @@ -479,7 +478,7 @@ SUBROUTINE UNPACK_BUTTERFLY_STRUCT(YD_STRUCT,YD_CLONE,YDMEMBUF) END SUBROUTINE UNPACK_BUTTERFLY_STRUCT !=========================================================================== SUBROUTINE EXTRACT_SUB(YDNODE,PMAT,PSUB) -IMPLICIT NONE + TYPE(NODE_TYPE),INTENT(IN) :: YDNODE REAL(KIND=JPRD),INTENT(IN) :: PMAT(:,:) REAL(KIND=JPRD),INTENT(OUT) :: PSUB(:,:) @@ -500,7 +499,7 @@ SUBROUTINE EXTRACT_SUB(YDNODE,PMAT,PSUB) END SUBROUTINE EXTRACT_SUB !=================================================================== SUBROUTINE COMBINE_B(PBL,KRANKL,PBR,KRANKR,KROWS,KOFFROW,PBCOMB) -IMPLICIT NONE + REAL(KIND=JPRD),INTENT(IN) :: PBL(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KRANKL REAL(KIND=JPRD),INTENT(IN) :: PBR(:,:) @@ -525,7 +524,7 @@ SUBROUTINE COMBINE_B(PBL,KRANKL,PBR,KRANKR,KROWS,KOFFROW,PBCOMB) END SUBROUTINE COMBINE_B !=================================================================== SUBROUTINE COMPRESS_MAT(YDNODE,YDBNODE,PEPS,KROWS,KCOLS,PSUB) -IMPLICIT NONE + TYPE(NODE_TYPE),INTENT(INOUT) :: YDNODE TYPE(NODE_TYPE),INTENT(INOUT) :: YDBNODE REAL(KIND=JPRD),INTENT(IN) :: PEPS @@ -564,7 +563,7 @@ SUBROUTINE COMPRESS_MAT(YDNODE,YDBNODE,PEPS,KROWS,KCOLS,PSUB) END SUBROUTINE COMPRESS_MAT !==================================================================== SUBROUTINE MULT_BUTV(CDTRANS,YD_STRUCT,PVECIN,PVECOUT) -IMPLICIT NONE + ! Multiply vector by matrix represented by buttervfly TYPE(BUTTERFLY_STRUCT),INTENT(IN) :: YD_STRUCT ! Structure from constucT-butterfly @@ -687,7 +686,6 @@ SUBROUTINE MULT_BUTV(CDTRANS,YD_STRUCT,PVECIN,PVECOUT) END SUBROUTINE MULT_BUTV !==================================================================== SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) -IMPLICIT NONE ! Multiply matrix by matrix represented by butterfly @@ -988,7 +986,6 @@ END SUBROUTINE MULT_BUTM !===================================================================== SUBROUTINE MULT_P(YDNODE,PVECIN,PVECOUT) ! Multiply vector by projection matrix -IMPLICIT NONE TYPE(NODE_TYPE),INTENT(INOUT) :: YDNODE REAL(KIND=JPRB),INTENT(IN) :: PVECIN(:) REAL(KIND=JPRB),INTENT(OUT) :: PVECOUT(:) @@ -1021,7 +1018,6 @@ SUBROUTINE MULT_P(YDNODE,PVECIN,PVECOUT) END SUBROUTINE MULT_P !===================================================================== SUBROUTINE MULT_PM(YDNODE,KF,KLBETA,PVECIN,PVECOUT) -IMPLICIT NONE ! Multiply matrix by projection matrix TYPE(NODE_TYPE),INTENT(INOUT) :: YDNODE INTEGER(KIND=JPIM),INTENT(IN) :: KF @@ -1056,7 +1052,6 @@ END SUBROUTINE MULT_PM !================================================================== SUBROUTINE MULT_P_TR(YDNODE,PVECIN,PVECOUT) ! Multiply vector by transposed procetion matrix -IMPLICIT NONE TYPE(NODE_TYPE),INTENT(INOUT) :: YDNODE REAL(KIND=JPRB),INTENT(IN) :: PVECIN(:) REAL(KIND=JPRB),INTENT(OUT) :: PVECOUT(:) @@ -1089,7 +1084,6 @@ END SUBROUTINE MULT_P_TR !================================================================== SUBROUTINE MULT_P_TRM(YDNODE,KF,PVECIN,PVECOUT) ! Multiply matrix by transposed procetion matrix -IMPLICIT NONE TYPE(NODE_TYPE),INTENT(INOUT) :: YDNODE INTEGER(KIND=JPIM),INTENT(IN) :: KF REAL(KIND=JPRB),INTENT(IN) :: PVECIN(:,:) From 2317dd9dc0a2034cabf1a42f9e406b1f734a48b9 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 20 May 2024 15:35:07 +0000 Subject: [PATCH 35/48] Use new precision-agnostic GEMM for MULT_P_TRM Also remove that copy to ZVECIN -> it served no purpose. --- src/trans/algor/butterfly_alg_mod.F90 | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/trans/algor/butterfly_alg_mod.F90 b/src/trans/algor/butterfly_alg_mod.F90 index f7b5584e..96bac57c 100644 --- a/src/trans/algor/butterfly_alg_mod.F90 +++ b/src/trans/algor/butterfly_alg_mod.F90 @@ -1082,34 +1082,35 @@ SUBROUTINE MULT_P_TR(YDNODE,PVECIN,PVECOUT) END SUBROUTINE MULT_P_TR !================================================================== -SUBROUTINE MULT_P_TRM(YDNODE,KF,PVECIN,PVECOUT) +SUBROUTINE MULT_P_TRM(YDNODE, KF, PVECIN, PVECOUT) ! Multiply matrix by transposed procetion matrix TYPE(NODE_TYPE),INTENT(INOUT) :: YDNODE INTEGER(KIND=JPIM),INTENT(IN) :: KF REAL(KIND=JPRB),INTENT(IN) :: PVECIN(:,:) REAL(KIND=JPRB),INTENT(OUT) :: PVECOUT(:,:) -REAL(KIND=JPRB) :: ZVECOUT(YDNODE%ICOLS,KF), ZVECIN(SIZE(PVECIN(:,1)),KF) -INTEGER(KIND=JPIM) :: JK,JN,IDX,IM,IN,JF +REAL(KIND=JPRB) :: ZVECOUT(YDNODE%ICOLS,KF) +INTEGER(KIND=JPIM) :: JK, JN, IDX, IM, IN, JF !------------------------------------------------------------------ IN = YDNODE%ICOLS-YDNODE%IRANK IM = YDNODE%IRANK -IF(IN>0) THEN - IF (JPRB == JPRD) THEN - ZVECIN(:,:) = PVECIN(:,:) - CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,YDNODE%PNONIM(1),IM,ZVECIN(1,1),IM,0.0_JPRD,ZVECOUT(YDNODE%IRANK+1,1),YDNODE%ICOLS) - ELSE - CALL GEMM('T','N',IN,KF,IM,1.0_JPRM,YDNODE%PNONIM(1),IM,PVECIN(1,1),IM,0.0_JPRM,ZVECOUT(YDNODE%IRANK+1,1),YDNODE%ICOLS) - ENDIF +IF (IN > 0) THEN + CALL GEMM('T', 'N', & + & IN, KF, IM, & + & 1.0_JPRB, & + & YDNODE%PNONIM(1), IM, & + & PVECIN(1,1), IM, & + & 0.0_JPRB, & + & ZVECOUT(YDNODE%IRANK+1,1),YDNODE%ICOLS) ENDIF -DO JF=1,KF - DO JK=1,YDNODE%IRANK +DO JF = 1, KF + DO JK = 1, YDNODE%IRANK IDX = YDNODE%ICLIST(JK) PVECOUT(IDX,JF) = PVECIN(JK,JF) ENDDO - DO JN=YDNODE%IRANK+1,YDNODE%ICOLS + DO JN = YDNODE%IRANK + 1, YDNODE%ICOLS IDX = YDNODE%ICLIST(JN) PVECOUT(IDX,JF) = ZVECOUT(JN,JF) ENDDO From b45854780488e2621d145b32c86fa9ae985b4bd1 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 20 May 2024 16:23:35 +0000 Subject: [PATCH 36/48] Simplify MULT_P and MULT_P_TR In both cases we have a temporary variable copy the same type and shape as what's being copied into it. These can be deleted. --- src/trans/algor/butterfly_alg_mod.F90 | 52 ++++++++++++++------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/src/trans/algor/butterfly_alg_mod.F90 b/src/trans/algor/butterfly_alg_mod.F90 index 96bac57c..2a6615a6 100644 --- a/src/trans/algor/butterfly_alg_mod.F90 +++ b/src/trans/algor/butterfly_alg_mod.F90 @@ -990,29 +990,30 @@ SUBROUTINE MULT_P(YDNODE,PVECIN,PVECOUT) REAL(KIND=JPRB),INTENT(IN) :: PVECIN(:) REAL(KIND=JPRB),INTENT(OUT) :: PVECOUT(:) -REAL(KIND=JPRB) :: ZVECIN(YDNODE%ICOLS), ZVECOUT(SIZE(PVECOUT)) -INTEGER(KIND=JPIM) :: JN,IDX,IRANK,IM,IN +REAL(KIND=JPRB) :: ZVECIN(YDNODE%ICOLS) +INTEGER(KIND=JPIM) :: JN, IDX, IRANK, IM, IN !--------------------------------------------------------- IRANK = YDNODE%IRANK -DO JN=1,YDNODE%ICOLS +DO JN = 1, YDNODE%ICOLS IDX = YDNODE%ICLIST(JN) - IF(JN <= IRANK) THEN - ZVECOUT(JN) = PVECIN(IDX) + IF (JN <= IRANK) THEN + PVECOUT(JN) = PVECIN(IDX) ELSE ZVECIN(JN) = PVECIN(IDX) ENDIF ENDDO -IF(YDNODE%ICOLS > IRANK) THEN +IF (YDNODE%ICOLS > IRANK) THEN IM = IRANK IN = YDNODE%ICOLS-IRANK - IF (JPRB == JPRD) THEN - CALL GEMV('N',IM,IN,1.0_JPRD,YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1),1,1.0_JPRD,ZVECOUT(1),1) - PVECOUT(:)=ZVECOUT(:) - ELSE - CALL GEMV('N',IM,IN,1.0_JPRM,YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1),1,1.0_JPRM,PVECOUT(1),1) - ENDIF + CALL GEMV('N', & + & IM, IN, & + & 1.0_JPRB, & + & YDNODE%PNONIM(1), IRANK, & + & ZVECIN(IRANK+1), 1, & + & 1.0_JPRB, & + & PVECOUT(1), 1) ENDIF END SUBROUTINE MULT_P @@ -1043,39 +1044,40 @@ SUBROUTINE MULT_PM(YDNODE,KF,KLBETA,PVECIN,PVECOUT) ENDIF ENDDO ENDDO -IF(YDNODE%ICOLS > IRANK) THEN +IF (YDNODE%ICOLS > IRANK) THEN CALL GEMM('N','N',IRANK,KF,IN,1.0_JPRB,& & YDNODE%PNONIM(1),IRANK,ZVECIN(IRANK+1,1),YDNODE%ICOLS,1.0_JPRB,& & PVECOUT(1,1),IRANK) ENDIF END SUBROUTINE MULT_PM !================================================================== -SUBROUTINE MULT_P_TR(YDNODE,PVECIN,PVECOUT) +SUBROUTINE MULT_P_TR(YDNODE, PVECIN, PVECOUT) ! Multiply vector by transposed procetion matrix TYPE(NODE_TYPE),INTENT(INOUT) :: YDNODE REAL(KIND=JPRB),INTENT(IN) :: PVECIN(:) REAL(KIND=JPRB),INTENT(OUT) :: PVECOUT(:) -REAL(KIND=JPRB) :: ZVECOUT(YDNODE%ICOLS), ZVECIN(SIZE(PVECIN)) -INTEGER(KIND=JPIM) :: JK,JN,IDX,IRANK,IM,IN +REAL(KIND=JPRB) :: ZVECOUT(YDNODE%ICOLS) +INTEGER(KIND=JPIM) :: JK, JN, IDX, IRANK, IM, IN !--------------------------------------------------------- IRANK = YDNODE%IRANK IN = YDNODE%ICOLS-IRANK -IF(IN>0) THEN +IF (IN > 0) THEN IM = IRANK - IF (JPRB == JPRD) THEN - ZVECIN(:) = PVECIN(:) - CALL GEMV('T',IM,IN,1.0_JPRD,YDNODE%PNONIM(1),IRANK,ZVECIN(1),1,0.0_JPRD,ZVECOUT(IRANK+1),1) - ELSE - CALL GEMV('T',IM,IN,1.0_JPRM,YDNODE%PNONIM(1),IRANK,PVECIN(1),1,0.0_JPRM,ZVECOUT(IRANK+1),1) - ENDIF + CALL GEMV('T', & + & IM, IN,& + & 1.0_JPRB, & + & YDNODE%PNONIM(1), IRANK, & + & PVECIN(1), 1, & + & 0.0_JPRB, & + & ZVECOUT(IRANK+1), 1) ENDIF -DO JK=1,IRANK +DO JK = 1, IRANK IDX = YDNODE%ICLIST(JK) PVECOUT(IDX) = PVECIN(JK) ENDDO -DO JN=IRANK+1,YDNODE%ICOLS +DO JN = IRANK + 1,YDNODE%ICOLS IDX = YDNODE%ICLIST(JN) PVECOUT(IDX) = ZVECOUT(JN) ENDDO From 2a2581acf38b19862ee141139d828191c9c561dd Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 20 May 2024 16:26:41 +0000 Subject: [PATCH 37/48] Simplify MULT_BUTM The previous combination of IF statements meant an unnecessary number of different calls to GEMM. GEMM now wraps the double and single precision versions so we can take advantage of this to reduce the amount of code repetition. --- src/trans/algor/butterfly_alg_mod.F90 | 162 +++++++++++++------------- 1 file changed, 82 insertions(+), 80 deletions(-) diff --git a/src/trans/algor/butterfly_alg_mod.F90 b/src/trans/algor/butterfly_alg_mod.F90 index 2a6615a6..dbb5ca8e 100644 --- a/src/trans/algor/butterfly_alg_mod.F90 +++ b/src/trans/algor/butterfly_alg_mod.F90 @@ -63,8 +63,6 @@ MODULE BUTTERFLY_ALG_MOD REAL(KIND=JPRB) , ALLOCATABLE :: COMMSBUF(:) ! for communicating packed bufferfly_structs END TYPE CLONE ! between MPI tasks -LOGICAL, PARAMETER :: LLDOUBLE = (JPRB == JPRD) - CONTAINS !================================================================================ SUBROUTINE CONSTRUCT_BUTTERFLY(PEPS,KCMAX,KM,KN,PMAT,YD_STRUCT) @@ -726,12 +724,12 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) ALLOCATE(ZBETA(ILBETA,KF,0:1)) ! Work space for "beta" ! ONWR 5.4.3 -IF(LLTRANSPOSE) THEN - IF( IKWV == 0 ) THEN - ALLOCATE(ZBETA_D(ILBETA,KF)) - ALLOCATE(ZOUT_D(YD_STRUCT%N_ORDER,KF)) - ALLOCATE(ZIN_D(IRIN,KF)) - ENDIF +IF (LLTRANSPOSE) THEN + IF (IKWV == 0 .AND. JPRB /= JPRD) THEN + ALLOCATE(ZBETA_D(ILBETA,KF)) + ALLOCATE(ZOUT_D(YD_STRUCT%N_ORDER,KF)) + ALLOCATE(ZIN_D(IRIN,KF)) + ENDIF DO JL=ILEVS,0,-1 IBETALV = MOD(JL,2) @@ -747,31 +745,34 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) IM = YNODE%IRANK IF( IM <=0 ) CALL ABOR1('mult_butm: IM<=0 not allowed') IF(IN>0) THEN - IF (LLDOUBLE.OR.(IKWV == 0)) THEN - IF(.NOT.LLDOUBLE) THEN - ALLOCATE(ZPNONIM_D(IM,IN)) - II=0 - DO JN=1,IN - DO JM=1,IM - II = II+1 - ZPNONIM_D(JM,JN) = REAL(YNODE%PNONIM(II),JPRD) - ENDDO + ! Force GEMMs for the zeroth wavenumber to be double precision + IF (IKWV == 0 .AND. JPRB /= JPRD) THEN + ALLOCATE(ZPNONIM_D(IM,IN)) + II = 0 + DO JN = 1, IN + DO JM = 1, IM + II = II + 1 + ZPNONIM_D(JM,JN) = REAL(YNODE%PNONIM(II),JPRD) ENDDO - ZBETA_D(1:IM,1:KF)=REAL(ZBETA(IBTST:IBTST+IM-1,1:KF,IBETALV),JPRD) - CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,& - & ZPNONIM_D(1,1),IM,ZBETA_D(1,1),ILBETA,0.0_JPRD,& - & ZOUT_D(1,1),YD_STRUCT%N_ORDER) - ZVECOUT(YNODE%IRANK+1:YNODE%IRANK+IN,1:KF) = REAL(ZOUT_D(1:IN,1:KF),JPRM) - DEALLOCATE(ZPNONIM_D) - ELSE - CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,& - & YNODE%PNONIM(1),IM,ZBETA(IBTST,1,IBETALV),ILBETA,0.0_JPRD,& - & ZVECOUT(YNODE%IRANK+1,1),YD_STRUCT%N_ORDER) - ENDIF + ENDDO + ZBETA_D(1:IM,1:KF) = REAL(ZBETA(IBTST:IBTST+IM-1,1:KF,IBETALV),JPRD) + CALL GEMM('T', 'N', & + & IN, KF, IM, & + & 1.0_JPRD, & + & ZPNONIM_D(1,1), IM, & + & ZBETA_D(1,1), ILBETA, & + & 0.0_JPRD, & + & ZOUT_D(1,1), YD_STRUCT%N_ORDER) + ZVECOUT(YNODE%IRANK+1:YNODE%IRANK+IN,1:KF) = REAL(ZOUT_D(1:IN,1:KF),JPRB) + DEALLOCATE(ZPNONIM_D) ELSE - CALL GEMM('T','N',IN,KF,IM,1.0_JPRM,& - & YNODE%PNONIM(1),IM,ZBETA(IBTST,1,IBETALV),ILBETA,0.0_JPRM,& - & ZVECOUT(YNODE%IRANK+1,1),YD_STRUCT%N_ORDER) + CALL GEMM('T', 'N', & + & IN, KF, IM, & + & 1.0_JPRB, & + & YNODE%PNONIM(1), IM, & + & ZBETA(IBTST,1,IBETALV), ILBETA, & + & 0.0_JPRB, & + & ZVECOUT(YNODE%IRANK+1,1), YD_STRUCT%N_ORDER) ENDIF ENDIF DO JF=1,KF @@ -790,29 +791,29 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) ILR = YNODE%ILROW IROWS =YNODE%IROWS IRANK = YNODE%IRANK - IF (LLDOUBLE.OR.(IKWV == 0)) THEN - IF(.NOT.LLDOUBLE) THEN - ALLOCATE(ZB_D(IROWS,IRANK)) - ZB_D(1:IROWS,1:IRANK) = REAL(YNODE%B(1:IROWS,1:IRANK),JPRD) - ZIN_D(1:ILR-IFR+1,1:KF) = REAL(PVECIN(IFR:ILR,1:KF),JPRD) - - CALL GEMM('T','N',IRANK,KF,IROWS,1.0_JPRD,& - & ZB_D,IROWS,ZIN_D,IRIN,0.0_JPRD,& - & ZBETA_D,ILBETA) - - ZBETA(IBTST:IBTST+IRANK-1,1:KF,IBETALV)=REAL(ZBETA_D(1:IRANK,1:KF),JPRM) - DEALLOCATE(ZB_D) - - ELSE - CALL GEMM('T','N',IRANK,KF,IROWS,1.0_JPRD,& - & YNODE%B(1,1),IROWS,PVECIN(IFR,1),IRIN,0.0_JPRD,& - & ZBETA(IBTST,1,IBETALV),ILBETA) - END IF + ! Force GEMMs for the zeroth wavenumber to be double precision + IF (IKWV == 0 .AND. JPRB /= JPRD) THEN + ALLOCATE(ZB_D(IROWS,IRANK)) + ZB_D(1:IROWS,1:IRANK) = REAL(YNODE%B(1:IROWS,1:IRANK),JPRD) + ZIN_D(1:ILR-IFR+1,1:KF) = REAL(PVECIN(IFR:ILR,1:KF),JPRD) + CALL GEMM('T', 'N', & + & IRANK, KF, IROWS, & + & 1.0_JPRD, & + & ZB_D, IROWS, & + & ZIN_D, IRIN, & + & 0.0_JPRD, & + & ZBETA_D, ILBETA) + ZBETA(IBTST:IBTST+IRANK-1,1:KF,IBETALV) = REAL(ZBETA_D(1:IRANK,1:KF),JPRM) + DEALLOCATE(ZB_D) ELSE - CALL GEMM('T','N',IRANK,KF,IROWS,1.0_JPRM,& - & YNODE%B(1,1),IROWS,PVECIN(IFR,1),IRIN,0.0_JPRM,& - & ZBETA(IBTST,1,IBETALV),ILBETA) - ENDIF + CALL GEMM('T', 'N', & + & IRANK, KF, IROWS, & + & 1.0_JPRB, & + & YNODE%B(1,1), IROWS, & + & PVECIN(IFR,1), IRIN, & + & 0.0_JPRB, & + & ZBETA(IBTST,1,IBETALV), ILBETA) + END IF ENDIF ILM1 = JL-1 IBETALVM1=MOD(ILM1,2) @@ -834,33 +835,34 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) IM = YNODE%IRANK IF( IM <=0 ) CALL ABOR1('mult_butm: IM<=0 not allowed') IF(IN>0) THEN - IF (LLDOUBLE.OR.(IKWV == 0)) THEN - IF(.NOT.LLDOUBLE) THEN - ALLOCATE(ZPNONIM_D(IM,IN)) - II=0 - DO JN=1,IN - DO JM=1,IM - II = II+1 - ZPNONIM_D(JM,JN) = REAL(YNODE%PNONIM(II),JPRD) - ENDDO - ENDDO - ZBETA_D(1:IM,1:KF)=REAL(ZBETA(IBTST:IBTST+IM-1,1:KF,IBETALV),JPRD) - - CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,& - & ZPNONIM_D,IM,ZBETA_D,ILBETA,0.0_JPRD,& - & ZOUT_D,YD_STRUCT%N_ORDER) - - ZVECOUT(YNODE%IRANK+1:YNODE%IRANK+IN,1:KF) = REAL(ZOUT_D(1:IN,1:KF),JPRM) - DEALLOCATE(ZPNONIM_D) - ELSE - CALL GEMM('T','N',IN,KF,IM,1.0_JPRD,& - & YNODE%PNONIM(1),IM,ZBETA(IBTST,1,IBETALV),ILBETA,0.0_JPRD,& - & ZVECOUT(YNODE%IRANK+1,1),YD_STRUCT%N_ORDER) - ENDIF + ! Force GEMMs for the zeroth wavenumber to be double precision + IF (IKWV == 0 .AND. JPRB /= JPRD) THEN + ALLOCATE(ZPNONIM_D(IM,IN)) + II = 0 + DO JN = 1, IN + DO JM = 1, IM + II = II + 1 + ZPNONIM_D(JM,JN) = REAL(YNODE%PNONIM(II),JPRD) + ENDDO + ENDDO + ZBETA_D(1:IM,1:KF) = REAL(ZBETA(IBTST:IBTST+IM-1,1:KF,IBETALV),JPRD) + CALL GEMM('T', 'N', & + & IN, KF, IM, & + & 1.0_JPRD, & + & ZPNONIM_D, IM, & + & ZBETA_D, ILBETA, & + & 0.0_JPRD,& + & ZOUT_D, YD_STRUCT%N_ORDER) + ZVECOUT(YNODE%IRANK+1:YNODE%IRANK+IN,1:KF) = REAL(ZOUT_D(1:IN,1:KF),JPRM) + DEALLOCATE(ZPNONIM_D) ELSE - CALL GEMM('T','N',IN,KF,IM,1.0_JPRM,& - & YNODE%PNONIM(1),IM,ZBETA(IBTST,1,IBETALV),ILBETA,0.0_JPRM,& - & ZVECOUT(YNODE%IRANK+1,1),YD_STRUCT%N_ORDER) + CALL GEMM('T', 'N', & + & IN, KF, IM, & + & 1.0_JPRB, & + & YNODE%PNONIM(1), IM, & + & ZBETA(IBTST,1,IBETALV), ILBETA, & + & 0.0_JPRB, & + & ZVECOUT(YNODE%IRANK+1,1), YD_STRUCT%N_ORDER) ENDIF ENDIF DO JF=1,KF @@ -894,7 +896,7 @@ SUBROUTINE MULT_BUTM(CDTRANS,YD_STRUCT,KF,PVECIN,PVECOUT,KWV) ENDDO ENDDO - IF( IKWV == 0 ) THEN + IF (IKWV == 0 .AND. JPRB /= JPRD) THEN DEALLOCATE(ZBETA_D) DEALLOCATE(ZOUT_D) DEALLOCATE(ZIN_D) From 7e6e8366ca7728efc036cdc662e8ae7b887bc4e8 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 27 Jun 2024 08:32:47 +0000 Subject: [PATCH 38/48] Add back LDUSEFFTW with a deprecation warning --- src/trans/external/setup_trans.F90 | 9 ++++++++- src/trans/include/ectrans/setup_trans.h | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/trans/external/setup_trans.F90 b/src/trans/external/setup_trans.F90 index 7ebdace8..2149b0ad 100644 --- a/src/trans/external/setup_trans.F90 +++ b/src/trans/external/setup_trans.F90 @@ -10,7 +10,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& &KTMAX,KRESOL,PWEIGHT,LDGRIDONLY,LDUSERPNM,LDKEEPRPNM,LDUSEFLT,& -&LDSPSETUPONLY,LDPNMONLY,& +&LDSPSETUPONLY,LDPNMONLY,LDUSEFFTW,& &LDLL,LDSHIFTLL,CDIO_LEGPOL,CDLEGPOLFNAME,KLEGPOLPTR,KLEGPOLPTR_LEN) !**** *SETUP_TRANS* - Setup transform package for specific resolution @@ -53,6 +53,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! LDKEEPRPNM - Keep Legendre Polynomials (only applicable when using ! FLT, otherwise always kept) ! LDPNMONLY - Compute the Legendre polynomials only, not the FFTs. +! LDUSEFFTW - Use FFTW for FFTs (option deprecated - FFTW is now mandatory) ! LDLL - Setup second set of input/output latitudes ! the number of input/output latitudes to transform is equal KDGL ! or KDGL+2 in the case that includes poles + equator @@ -142,6 +143,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& LOGICAL ,OPTIONAL,INTENT(IN):: LDKEEPRPNM LOGICAL ,OPTIONAL,INTENT(IN):: LDSPSETUPONLY LOGICAL ,OPTIONAL,INTENT(IN):: LDPNMONLY +LOGICAL ,OPTIONAL,INTENT(IN):: LDUSEFFTW LOGICAL ,OPTIONAL,INTENT(IN):: LDLL LOGICAL ,OPTIONAL,INTENT(IN):: LDSHIFTLL CHARACTER(LEN=*),OPTIONAL,INTENT(IN):: CDIO_LEGPOL @@ -321,6 +323,11 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& D%LCPNMONLY=LDPNMONLY ENDIF +IF(PRESENT(LDUSEFFTW)) THEN + WRITE(NOUT,*) 'LDUSEFFTW option provided to SETUP_TRANS' + WRITE(NOUT,*) 'FFTW is now mandatory so this option is deprecated' +ENDIF + S%LSOUTHPNM=.FALSE. IF(PRESENT(PSTRET)) THEN IF (ABS(PSTRET-1.0_JPRD)>100._JPRD*EPSILON(1._JPRD)) THEN diff --git a/src/trans/include/ectrans/setup_trans.h b/src/trans/include/ectrans/setup_trans.h index aa8dd833..72e6b5a6 100644 --- a/src/trans/include/ectrans/setup_trans.h +++ b/src/trans/include/ectrans/setup_trans.h @@ -11,7 +11,7 @@ INTERFACE SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& &KTMAX,KRESOL,PWEIGHT,LDGRIDONLY,LDUSERPNM,LDKEEPRPNM,LDUSEFLT,& -&LDSPSETUPONLY,LDPNMONLY,& +&LDSPSETUPONLY,LDPNMONLY,LDUSEFFTW,& &LDLL,LDSHIFTLL,CDIO_LEGPOL,CDLEGPOLFNAME,KLEGPOLPTR,KLEGPOLPTR_LEN) !**** *SETUP_TRANS* - Setup transform package for specific resolution @@ -51,6 +51,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! LDKEEPRPNM - Keep Legendre Polynomials (only applicable when using ! FLT, otherwise always kept) ! LDPNMONLY - Compute the Legendre polynomialsonly, not the FFTs. +! LDUSEFFTW - Use FFTW for FFTs (option deprecated - FFTW is now mandatory) ! LDLL - Setup second set of input/output latitudes ! the number of input/output latitudes to transform is equal KDGL ! or KDGL+2 in the case that includes poles + equator @@ -100,6 +101,7 @@ LOGICAL ,OPTIONAL,INTENT(IN):: LDUSERPNM LOGICAL ,OPTIONAL,INTENT(IN):: LDKEEPRPNM LOGICAL ,OPTIONAL,INTENT(IN):: LDPNMONLY LOGICAL ,OPTIONAL,INTENT(IN):: LDSPSETUPONLY +LOGICAL ,OPTIONAL,INTENT(IN):: LDUSEFFTW LOGICAL ,OPTIONAL,INTENT(IN):: LDLL LOGICAL ,OPTIONAL,INTENT(IN):: LDSHIFTLL CHARACTER(LEN=*),OPTIONAL,INTENT(IN):: CDIO_LEGPOL From 7b1a69a2518f92a0ca2e88923cac3f7d59f1ea62 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 2 Jul 2024 13:53:44 +0000 Subject: [PATCH 39/48] Fix bug in median calculation With the old style, the calculation of the median was broken for runs with a single iteration (accessing the zeroth element of the time step storage array) and did not give a conventional value for runs with two iterations (it should be the average of the two). Resolves #108. --- src/programs/ectrans-benchmark.F90 | 46 +++++++++++++++++------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index 36fa5249..748d88a5 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -849,21 +849,15 @@ program transform_test ztstepavg = (ztstepavg/real(nproc,jprb))/real(iters,jprd) ztloop = ztloop/real(nproc,jprd) ztstep(:) = ztstep(:)/real(nproc,jprd) - -call sort(ztstep,iters) -ztstepmed = ztstep(iters/2) +ztstepmed = get_median(ztstep) ztstepavg1 = (ztstepavg1/real(nproc,jprb))/real(iters,jprd) ztstep1(:) = ztstep1(:)/real(nproc,jprd) - -call sort(ztstep1, iters) -ztstepmed1 = ztstep1(iters/2) +ztstepmed1 = get_median(ztstep1) ztstepavg2 = (ztstepavg2/real(nproc,jprb))/real(iters,jprd) ztstep2(:) = ztstep2(:)/real(nproc,jprd) - -call sort(ztstep2,iters) -ztstepmed2 = ztstep2(iters/2) +ztstepmed2 = get_median(ztstep2) write(nout,'(a)') '======= Start of time step stats =======' write(nout,'(" ")') @@ -1139,27 +1133,39 @@ end subroutine str2int !=================================================================================================== -subroutine sort(a, n) +function get_median(vec) result(median) - integer(kind=jpim), intent(in) :: n - real(kind=jprd), intent(inout) :: a(n) + real(kind=jprd), intent(in) :: vec(:) + real(kind=jprd) :: median + real(kind=jprd) :: vec_sorted(size(vec)) real(kind=jprd) :: x - integer :: i, j + integer :: i, j, n + + n = size(vec) + ! Sort in ascending order + vec_sorted = vec do i = 2, n - x = a(i) + x = vec_sorted(i) j = i - 1 do while (j >= 1) - if (a(j) <= x) exit - a(j + 1) = a(j) - j = j - 1 - end do - a(j + 1) = x + if (vec_sorted(j) <= x) exit + vec_sorted(j + 1) = vec_sorted(j) + j = j - 1 + end do + vec_sorted(j + 1) = x end do -end subroutine sort + ! Calculate median according to if there is an even or odd number of elements + if (mod(n, 2) == 0) then + median = (vec_sorted(n/2) + vec_sorted(n/2+1))/2.0_jprd + else + median = vec_sorted((n+1)/2) + endif + +end function get_median !=================================================================================================== From 55c5220b7a9221e475157306929fd55bd7bbafb2 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 3 Jul 2024 09:37:14 +0000 Subject: [PATCH 40/48] Fix bug in zmaxerrg calculation Previously we completely ignored zmaxerr(4)! --- src/programs/ectrans-benchmark.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index 748d88a5..9e42f40a 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -790,7 +790,7 @@ program transform_test enddo ! maximum error across all fields - zmaxerrg = max(max(zmaxerr(1),zmaxerr(2)), max(zmaxerr(2), zmaxerr(3))) + zmaxerrg = max(zmaxerr(1), zmaxerr(2), zmaxerr(3), zmaxerr(4)) if (verbosity >= 1) write(nout,*) write(nout,'("max error zspvor(1:nlev,:) = ",e10.3)') zmaxerr(3) From a71995badfa094a8e4eaa7787307b22da55abe9d Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 3 Jul 2024 10:11:58 +0000 Subject: [PATCH 41/48] Wrap all references to zspsc3a with if (nfld > 0) checks Now that we actually check the error of the scalar fields instead of ignoring it, we have to make sure we only compute and print the error norms when the user requests more than 0 scalar fields. Otherwise we'll get NaNs and infinities. --- src/programs/ectrans-benchmark.F90 | 56 +++++++++++++++++++----------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index 9e42f40a..2e5232b9 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -540,10 +540,12 @@ program transform_test write(nout,'("norm zspdiv( ",i4,",:) = ",f20.15)') ifld, znormdiv1(ifld) write(nout,'("0x",Z16.16)') znormdiv1(ifld) enddo - do ifld = 1, nflevg - write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15)') ifld, znormt1(ifld) - write(nout,'("0x",Z16.16)') znormt1(ifld) - enddo + if (nfld > 0) then + do ifld = 1, nflevg + write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15)') ifld, znormt1(ifld) + write(nout,'("0x",Z16.16)') znormt1(ifld) + enddo + endif do ifld = 1, 1 write(nout,'("norm zspsc2( ",i4,",:) = ",f20.15)') ifld, znormsp1(ifld) write(nout,'("0x",Z16.16)') znormsp1(ifld) @@ -723,13 +725,19 @@ program transform_test zmaxerr(3) = max(zmaxerr(3),zerr(3)) enddo ! Temperature - do ifld = 1, nflevg - zerr(4) = abs(znormt1(ifld)/znormt(ifld) - 1.0_jprb) - zmaxerr(4) = max(zmaxerr(4), zerr(4)) - enddo - write(nout,'("time step ",i6," took", f8.4," | zspvor max err="e10.3,& - & " | zspdiv max err="e10.3," | zspsc3a max err="e10.3," | zspsc2 max err="e10.3)') & - & jstep, ztstep(jstep), zmaxerr(3), zmaxerr(2), zmaxerr(4), zmaxerr(1) + if (nfld > 0) then + do ifld = 1, nflevg + zerr(4) = abs(znormt1(ifld)/znormt(ifld) - 1.0_jprb) + zmaxerr(4) = max(zmaxerr(4), zerr(4)) + enddo + write(nout,'("time step ",i6," took", f8.4," | zspvor max err="e10.3,& + & " | zspdiv max err="e10.3," | zspsc3a max err="e10.3," | zspsc2 max err="e10.3)') & + & jstep, ztstep(jstep), zmaxerr(3), zmaxerr(2), zmaxerr(4), zmaxerr(1) + else + write(nout,'("time step ",i6," took", f8.4," | zspvor max err="e10.3,& + & " | zspdiv max err="e10.3," | zspsc2 max err="e10.3)') & + & jstep, ztstep(jstep), zmaxerr(3), zmaxerr(2), zmaxerr(1) + endif endif call gstats(6,1) else @@ -772,14 +780,16 @@ program transform_test write(nout,'("0x",Z16.16)') znormdiv(ifld) endif enddo - do ifld = 1, nflevg - zerr(4) = abs(real(znormt1(ifld),kind=jprd)/real(znormt(ifld),kind=jprd) - 1.0d0) - zmaxerr(4) = max(zmaxerr(4), zerr(4)) - if (verbosity >= 1) then - write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15," error = ",e10.3)') ifld, znormt(ifld), zerr(4) - write(nout,'("0x",Z16.16)') znormt(ifld) - endif - enddo + if (nfld > 0) then + do ifld = 1, nflevg + zerr(4) = abs(real(znormt1(ifld),kind=jprd)/real(znormt(ifld),kind=jprd) - 1.0d0) + zmaxerr(4) = max(zmaxerr(4), zerr(4)) + if (verbosity >= 1) then + write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15," error = ",e10.3)') ifld, znormt(ifld), zerr(4) + write(nout,'("0x",Z16.16)') znormt(ifld) + endif + enddo + endif do ifld = 1, 1 zerr(1) = abs(real(znormsp1(ifld),kind=jprd)/real(znormsp(ifld),kind=jprd) - 1.0d0) zmaxerr(1) = max(zmaxerr(1), zerr(1)) @@ -790,12 +800,16 @@ program transform_test enddo ! maximum error across all fields - zmaxerrg = max(zmaxerr(1), zmaxerr(2), zmaxerr(3), zmaxerr(4)) + if (nfld > 0) then + zmaxerrg = max(zmaxerr(1), zmaxerr(2), zmaxerr(3), zmaxerr(4)) + else + zmaxerrg = max(zmaxerr(1), zmaxerr(2), zmaxerr(3)) + endif if (verbosity >= 1) write(nout,*) write(nout,'("max error zspvor(1:nlev,:) = ",e10.3)') zmaxerr(3) write(nout,'("max error zspdiv(1:nlev,:) = ",e10.3)') zmaxerr(2) - write(nout,'("max error zspsc3a(1:nlev,:,1) = ",e10.3)') zmaxerr(4) + if (nfld > 0) write(nout,'("max error zspsc3a(1:nlev,:,1) = ",e10.3)') zmaxerr(4) write(nout,'("max error zspsc2(1:1,:) = ",e10.3)') zmaxerr(1) write(nout,*) write(nout,'("max error combined = = ",e10.3)') zmaxerrg From 04e4c8006b936a8297d6bac55c823bef6d13a97c Mon Sep 17 00:00:00 2001 From: DJDavies2 Date: Tue, 9 Jul 2024 10:17:27 +0100 Subject: [PATCH 42/48] Initialise pointer --- src/trans/internal/tpm_fftw.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/internal/tpm_fftw.F90 b/src/trans/internal/tpm_fftw.F90 index f3faef0c..1800c2f7 100644 --- a/src/trans/internal/tpm_fftw.F90 +++ b/src/trans/internal/tpm_fftw.F90 @@ -42,7 +42,7 @@ MODULE TPM_FFTW TYPE FFTW_TYPE INTEGER(KIND=JPIM),ALLOCATABLE :: N_PLANS(:) - TYPE(FFTW_PLAN),POINTER :: FFTW_PLANS(:) + TYPE(FFTW_PLAN),POINTER :: FFTW_PLANS(:) => NULL() INTEGER(KIND=JPIM) :: N_MAX=0 ! maximum number of latitudes INTEGER(KIND=JPIM) :: N_MAX_PLANS=4 ! maximum number of plans for each active latitudes END TYPE FFTW_TYPE From 7e74b0ee56a22c732735ddf9be18a0a6e0f31fd5 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 9 Jul 2024 13:59:05 +0100 Subject: [PATCH 43/48] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ac5d987a..18b9abd9 100644 --- a/README.md +++ b/README.md @@ -85,10 +85,10 @@ Optionally, tests can be run to check succesful compilation, when the feature TE Reporting Bugs ============== -TODO +Please report bugs using a [GitHub issue](https://github.com/ecmwf-ifs/ectrans/issues). Support is given on a best-effort basis by package developers. Contributing ============ -TODO +Contributions to ecTrans are welcome. In order to do so, please open a [GitHub issue](https://github.com/ecmwf-ifs/ectrans/issues) where a feature request or bug can be discussed. Then create a [pull request](https://github.com/ecmwf-ifs/ectrans/pulls) with your contribution. All contributors to the pull request need to sign the [contributors license agreement (CLA)](https://claassistant.ecmwf.int/ecmwf-ifs/ectrans). From c8523be412cc4038a1bb8b389f5d983c5ae86c46 Mon Sep 17 00:00:00 2001 From: David Davies Date: Fri, 12 Jul 2024 11:34:23 +0100 Subject: [PATCH 44/48] Reset N_MAX to 0 in DESTROY_PLANS_FFTW (#119) --- src/trans/internal/tpm_fftw.F90 | 1 + 1 file changed, 1 insertion(+) diff --git a/src/trans/internal/tpm_fftw.F90 b/src/trans/internal/tpm_fftw.F90 index 1800c2f7..f424d0e3 100644 --- a/src/trans/internal/tpm_fftw.F90 +++ b/src/trans/internal/tpm_fftw.F90 @@ -241,6 +241,7 @@ SUBROUTINE DESTROY_PLANS_FFTW IF( ASSOCIATED(TW) ) THEN IF( ASSOCIATED(TW%FFTW_PLANS) ) DEALLOCATE(TW%FFTW_PLANS) IF( ALLOCATED(TW%N_PLANS) ) DEALLOCATE(TW%N_PLANS) + TW%N_MAX=0 ENDIF RETURN END SUBROUTINE DESTROY_PLANS_FFTW From ec5033957e408b5b3bfc0467db1dd300f3870831 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 15 Jul 2024 09:53:58 +0100 Subject: [PATCH 45/48] Removed NCOMBFLEN parameter This was effectively deprecated in 8aaf3ec. --- src/programs/ectrans-benchmark.F90 | 3 +-- src/trans/external/setup_trans0.F90 | 10 ++++++---- src/trans/include/ectrans/setup_trans0.h | 2 +- src/trans/internal/tpm_distr.F90 | 1 - 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index 2e5232b9..35059838 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -147,7 +147,6 @@ program transform_test integer(kind=jpim) :: nmax_resol = 37 ! Max number of resolutions integer(kind=jpim) :: npromatr = 0 ! nproma for trans lib -integer(kind=jpim) :: ncombflen = 1800000 ! Size of comm buffer integer(kind=jpim) :: nproc ! Number of procs integer(kind=jpim) :: nthread @@ -379,7 +378,7 @@ program transform_test call gstats(1, 0) call setup_trans0(kout=nout, kerr=nerr, kprintlev=merge(2, 0, verbosity == 1), & & kmax_resol=nmax_resol, kpromatr=npromatr, kprgpns=nprgpns, kprgpew=nprgpew, & - & kprtrw=nprtrw, kcombflen=ncombflen, ldsync_trans=lsync_trans, & + & kprtrw=nprtrw, ldsync_trans=lsync_trans, & & ldeq_regions=leq_regions, prad=zra, ldalloperm=.true., ldmpoff=.not.luse_mpi) call gstats(1, 1) diff --git a/src/trans/external/setup_trans0.F90 b/src/trans/external/setup_trans0.F90 index 8f63af35..cdd581f1 100644 --- a/src/trans/external/setup_trans0.F90 +++ b/src/trans/external/setup_trans0.F90 @@ -34,7 +34,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& ! KPRGPNS - splitting level in N-S direction in grid-point space [1] ! KPRGPEW - splitting level in E-W direction in grid-point space [1] ! KPRTRW - splitting level in wave direction in spectral space [1] -! KCOMBFLEN - Size of communication buffer [1800000 (*8bytes) ] +! KCOMBFLEN - Size of communication buffer [1800000 (*8bytes) ] (deprecated) ! LDMPOFF - switch off message passing [false] ! LDSYNC_TRANS - switch to activate barriers in trmtol trltom [false] ! KTRANS_SYNC_LEVEL - use of synchronization/blocking [0] @@ -74,7 +74,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& USE TPM_GEN ,ONLY : NERR, NOUT, LMPOFF, LSYNC_TRANS, NTRANS_SYNC_LEVEL, MSETUP0, & & NMAX_RESOL, NPRINTLEV, NPROMATR, LALLOPERM, NSTACK_MEMORY_TR -USE TPM_DISTR ,ONLY : LEQ_REGIONS, NCOMBFLEN, NPRGPEW,NPRGPNS, NPRTRW +USE TPM_DISTR ,ONLY : LEQ_REGIONS, NPRGPEW, NPRGPNS, NPRTRW USE TPM_CONSTANTS ,ONLY : RA USE SUMP_TRANS0_MOD ,ONLY : SUMP_TRANS0 @@ -121,7 +121,6 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& N_REGIONS_NS=1 N_REGIONS_EW=1 NPROMATR = 0 -NCOMBFLEN = 1800000 LMPOFF = .FALSE. LSYNC_TRANS=.FALSE. NTRANS_SYNC_LEVEL=0 @@ -171,7 +170,10 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& NPRTRW = KPRTRW ENDIF IF(PRESENT(KCOMBFLEN)) THEN - NCOMBFLEN = KCOMBFLEN + WRITE(NOUT,'(A)') + WRITE(NOUT,'(A)') '*** WARNING ***' + WRITE(NOUT,'(A)') 'KCOMBFLEN argument passed to SETUP_TRANS0 is deprecated' + WRITE(NOUT,'(A)') ENDIF IF(PRESENT(LDMPOFF)) THEN LMPOFF = LDMPOFF diff --git a/src/trans/include/ectrans/setup_trans0.h b/src/trans/include/ectrans/setup_trans0.h index d47d103f..26893e5c 100644 --- a/src/trans/include/ectrans/setup_trans0.h +++ b/src/trans/include/ectrans/setup_trans0.h @@ -35,7 +35,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& ! KPRGPNS - splitting level in N-S direction in grid-point space [1] ! KPRGPEW - splitting level in E-W direction in grid-point space [1] ! KPRTRW - splitting level in wave direction in spectral space [1] -! KCOMBFLEN - Size of communication buffer [1800000 (*8bytes) ] +! KCOMBFLEN - Size of communication buffer [1800000 (*8bytes) ] (deprecated) ! LDMPOFF - switch off message passing [false] ! LDSYNC_TRANS - switch to activate barrier before transforms [false] ! KTRANS_SYNC_LEVEL - use of synchronization/blocking [0] diff --git a/src/trans/internal/tpm_distr.F90 b/src/trans/internal/tpm_distr.F90 index 8ada1a93..ce0064f6 100644 --- a/src/trans/internal/tpm_distr.F90 +++ b/src/trans/internal/tpm_distr.F90 @@ -32,7 +32,6 @@ MODULE TPM_DISTR INTEGER(KIND=JPIM) :: MYPROC ! My processor number INTEGER(KIND=JPIM) :: MYSETW ! My set number in wave direction (spectral space) INTEGER(KIND=JPIM) :: MYSETV ! My set number in field direction(S.S and F.S) -INTEGER(KIND=JPIM) :: NCOMBFLEN ! Size of communication buffer INTEGER(KIND=JPIM) :: MTAGLETR ! Tag INTEGER(KIND=JPIM) :: MTAGML ! Tag From 548cce220dc49c679b81875c1b33db15ce07526a Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Fri, 19 Jul 2024 14:17:48 +0100 Subject: [PATCH 46/48] Add GPU capability (#106 , #120) The work of many years by many people finally contributed. --------- Co-authored-by: Andreas Mueller Co-authored-by: Nils Wedi Co-authored-by: Willem Deconinck Co-authored-by: Olivier Marsden Co-authored-by: Lukas Mosimann --- AUTHORS | 13 +- CMakeLists.txt | 114 +- README.md | 31 +- .../FindCUDAToolkit.cmake | 1026 +++++ cmake/ectrans_compile_options.cmake | 11 + cmake/ectrans_find_cuda.cmake | 25 + cmake/ectrans_find_hip.cmake | 150 + cmake/ectrans_macros.cmake | 8 + cmake/project_summary.cmake | 7 +- src/programs/CMakeLists.txt | 67 +- src/programs/ectrans-benchmark-ifs.F90 | 1506 +++++++ src/programs/ectrans-benchmark.F90 | 92 +- src/trans/CMakeLists.txt | 255 +- src/trans/cpu/CMakeLists.txt | 265 ++ .../{ => cpu}/algor/butterfly_alg_mod.F90 | 0 .../{ => cpu}/algor/ectrans_blas_mod.F90 | 0 .../{ => cpu}/algor/interpol_decomp_mod.F90 | 0 src/trans/{ => cpu}/algor/seefmm_mix.F90 | 0 src/trans/{ => cpu}/algor/wts500_mod.F90 | 0 src/trans/{ => cpu}/external/dir_trans.F90 | 0 src/trans/{ => cpu}/external/dir_transad.F90 | 0 src/trans/{ => cpu}/external/dist_grid.F90 | 0 src/trans/{ => cpu}/external/dist_grid_32.F90 | 0 src/trans/{ => cpu}/external/dist_spec.F90 | 0 src/trans/{ => cpu}/external/gath_grid.F90 | 0 src/trans/{ => cpu}/external/gath_grid_32.F90 | 0 src/trans/{ => cpu}/external/gath_spec.F90 | 0 src/trans/{ => cpu}/external/get_current.F90 | 0 src/trans/{ => cpu}/external/gpnorm_trans.F90 | 0 .../{ => cpu}/external/ini_spec_dist.F90 | 0 src/trans/{ => cpu}/external/inv_trans.F90 | 0 src/trans/{ => cpu}/external/inv_transad.F90 | 0 src/trans/{ => cpu}/external/setup_trans.F90 | 0 src/trans/{ => cpu}/external/setup_trans0.F90 | 1 + src/trans/{ => cpu}/external/specnorm.F90 | 0 src/trans/{ => cpu}/external/trans_end.F90 | 0 src/trans/{ => cpu}/external/trans_inq.F90 | 0 src/trans/{ => cpu}/external/trans_pnm.F90 | 0 .../{ => cpu}/external/trans_release.F90 | 0 src/trans/{ => cpu}/external/vordiv_to_uv.F90 | 0 .../{ => cpu}/internal/abort_trans_mod.F90 | 0 src/trans/{ => cpu}/internal/asre1_mod.F90 | 0 src/trans/{ => cpu}/internal/asre1ad_mod.F90 | 0 src/trans/{ => cpu}/internal/asre1b_mod.F90 | 0 src/trans/{ => cpu}/internal/asre1bad_mod.F90 | 0 src/trans/{ => cpu}/internal/cdmap_mod.F90 | 0 src/trans/{ => cpu}/internal/cpledn_mod.F90 | 0 .../{ => cpu}/internal/dealloc_resol_mod.F90 | 0 .../{ => cpu}/internal/dir_trans_ctl_mod.F90 | 0 .../internal/dir_trans_ctlad_mod.F90 | 0 .../internal/dist_grid_32_ctl_mod.F90 | 0 .../{ => cpu}/internal/dist_grid_ctl_mod.F90 | 0 .../internal/dist_spec_control_mod.F90 | 0 .../internal/ectrans_version_mod.F90.in | 0 .../{ => cpu}/internal/eq_regions_mod.F90 | 0 .../{ => cpu}/internal/field_split_mod.F90 | 0 .../{ => cpu}/internal/fourier_in_mod.F90 | 0 .../{ => cpu}/internal/fourier_inad_mod.F90 | 0 .../{ => cpu}/internal/fourier_out_mod.F90 | 0 .../{ => cpu}/internal/fourier_outad_mod.F90 | 0 src/trans/{ => cpu}/internal/fsc_mod.F90 | 0 src/trans/{ => cpu}/internal/fscad_mod.F90 | 0 .../{ => cpu}/internal/fspgl_int_mod.F90 | 0 .../{ => cpu}/internal/ftdir_ctl_mod.F90 | 0 .../{ => cpu}/internal/ftdir_ctlad_mod.F90 | 0 src/trans/{ => cpu}/internal/ftdir_mod.F90 | 0 src/trans/{ => cpu}/internal/ftdirad_mod.F90 | 0 .../{ => cpu}/internal/ftinv_ctl_mod.F90 | 0 .../{ => cpu}/internal/ftinv_ctlad_mod.F90 | 0 src/trans/{ => cpu}/internal/ftinv_mod.F90 | 0 src/trans/{ => cpu}/internal/ftinvad_mod.F90 | 0 .../internal/gath_grid_32_ctl_mod.F90 | 0 .../{ => cpu}/internal/gath_grid_ctl_mod.F90 | 0 .../internal/gath_spec_control_mod.F90 | 0 src/trans/{ => cpu}/internal/gawl_mod.F90 | 0 .../internal/gpnorm_trans_ctl_mod.F90 | 0 src/trans/{ => cpu}/internal/inigptr_mod.F90 | 0 .../{ => cpu}/internal/inv_trans_ctl_mod.F90 | 0 .../internal/inv_trans_ctlad_mod.F90 | 0 src/trans/{ => cpu}/internal/ldfou2_mod.F90 | 0 src/trans/{ => cpu}/internal/ldfou2ad_mod.F90 | 0 src/trans/{ => cpu}/internal/ledir_mod.F90 | 0 src/trans/{ => cpu}/internal/ledirad_mod.F90 | 0 src/trans/{ => cpu}/internal/leinv_mod.F90 | 0 src/trans/{ => cpu}/internal/leinvad_mod.F90 | 0 .../{ => cpu}/internal/ltdir_ctl_mod.F90 | 0 .../{ => cpu}/internal/ltdir_ctlad_mod.F90 | 0 src/trans/{ => cpu}/internal/ltdir_mod.F90 | 0 src/trans/{ => cpu}/internal/ltdirad_mod.F90 | 0 .../{ => cpu}/internal/ltinv_ctl_mod.F90 | 0 .../{ => cpu}/internal/ltinv_ctlad_mod.F90 | 0 src/trans/{ => cpu}/internal/ltinv_mod.F90 | 0 src/trans/{ => cpu}/internal/ltinvad_mod.F90 | 0 .../{ => cpu}/internal/myrecvset_mod.F90 | 0 .../{ => cpu}/internal/mysendset_mod.F90 | 0 src/trans/{ => cpu}/internal/pe2set_mod.F90 | 0 .../{ => cpu}/internal/pre_suleg_mod.F90 | 0 src/trans/{ => cpu}/internal/prepsnm_mod.F90 | 0 src/trans/{ => cpu}/internal/prfi1_mod.F90 | 0 src/trans/{ => cpu}/internal/prfi1ad_mod.F90 | 0 src/trans/{ => cpu}/internal/prfi1b_mod.F90 | 0 src/trans/{ => cpu}/internal/prfi1bad_mod.F90 | 0 src/trans/{ => cpu}/internal/prfi2_mod.F90 | 0 src/trans/{ => cpu}/internal/prfi2ad_mod.F90 | 0 src/trans/{ => cpu}/internal/prfi2b_mod.F90 | 0 src/trans/{ => cpu}/internal/prfi2bad_mod.F90 | 0 .../{ => cpu}/internal/read_legpol_mod.F90 | 0 src/trans/{ => cpu}/internal/set2pe_mod.F90 | 0 .../{ => cpu}/internal/set_resol_mod.F90 | 0 .../{ => cpu}/internal/setup_dims_mod.F90 | 0 .../{ => cpu}/internal/setup_geom_mod.F90 | 0 src/trans/{ => cpu}/internal/shuffle_mod.F90 | 0 .../{ => cpu}/internal/spnorm_ctl_mod.F90 | 0 src/trans/{ => cpu}/internal/spnormc_mod.F90 | 0 src/trans/{ => cpu}/internal/spnormd_mod.F90 | 0 src/trans/{ => cpu}/internal/spnsde_mod.F90 | 0 src/trans/{ => cpu}/internal/spnsdead_mod.F90 | 0 src/trans/{ => cpu}/internal/sugaw_mod.F90 | 0 src/trans/{ => cpu}/internal/suleg_mod.F90 | 0 .../{ => cpu}/internal/sump_trans0_mod.F90 | 0 .../{ => cpu}/internal/sump_trans_mod.F90 | 0 .../internal/sump_trans_preleg_mod.F90 | 0 src/trans/{ => cpu}/internal/sumplat_mod.F90 | 0 src/trans/{ => cpu}/internal/sumplatb_mod.F90 | 0 .../{ => cpu}/internal/sumplatbeq_mod.F90 | 0 src/trans/{ => cpu}/internal/sumplatf_mod.F90 | 0 src/trans/{ => cpu}/internal/supol_mod.F90 | 0 src/trans/{ => cpu}/internal/supolf_mod.F90 | 0 src/trans/{ => cpu}/internal/sustaonl_mod.F90 | 0 src/trans/{ => cpu}/internal/sutrle_mod.F90 | 0 src/trans/{ => cpu}/internal/suwavedi_mod.F90 | 0 .../{ => cpu}/internal/tpm_constants.F90 | 0 src/trans/{ => cpu}/internal/tpm_ctl.F90 | 0 src/trans/{ => cpu}/internal/tpm_dim.F90 | 0 src/trans/{ => cpu}/internal/tpm_distr.F90 | 0 src/trans/{ => cpu}/internal/tpm_fftw.F90 | 0 src/trans/{ => cpu}/internal/tpm_fields.F90 | 0 src/trans/{ => cpu}/internal/tpm_flt.F90 | 0 src/trans/{ => cpu}/internal/tpm_gen.F90 | 0 src/trans/{ => cpu}/internal/tpm_geometry.F90 | 0 src/trans/{ => cpu}/internal/tpm_pol.F90 | 0 src/trans/{ => cpu}/internal/tpm_trans.F90 | 0 src/trans/{ => cpu}/internal/trgtol_mod.F90 | 0 src/trans/{ => cpu}/internal/trltog_mod.F90 | 0 src/trans/{ => cpu}/internal/trltom_mod.F90 | 0 src/trans/{ => cpu}/internal/trmtol_mod.F90 | 0 src/trans/{ => cpu}/internal/updsp_mod.F90 | 0 src/trans/{ => cpu}/internal/updspad_mod.F90 | 0 src/trans/{ => cpu}/internal/updspb_mod.F90 | 0 src/trans/{ => cpu}/internal/updspbad_mod.F90 | 0 src/trans/{ => cpu}/internal/uvtvd_mod.F90 | 0 src/trans/{ => cpu}/internal/uvtvdad_mod.F90 | 0 .../{ => cpu}/internal/vd2uv_ctl_mod.F90 | 0 src/trans/{ => cpu}/internal/vd2uv_mod.F90 | 0 src/trans/{ => cpu}/internal/vdtuv_mod.F90 | 0 src/trans/{ => cpu}/internal/vdtuvad_mod.F90 | 0 .../{ => cpu}/internal/write_legpol_mod.F90 | 0 .../maybe_unused/external/sugawc.F90 | 0 .../maybe_unused/include/ectrans/sugawc.h | 0 src/trans/{ => cpu}/sedrenames.txt | 0 src/trans/{ => cpu}/sharedmem/sharedmem.c | 0 .../{ => cpu}/sharedmem/sharedmem_mod.F90 | 0 src/trans/gpu/CMakeLists.txt | 132 + src/trans/gpu/algor/c_hipmemgetinfo.cpp | 23 + src/trans/gpu/algor/device_mod.F90 | 82 + src/trans/gpu/algor/growing_allocator.h | 4 + src/trans/gpu/algor/hicblas.h | 37 + src/trans/gpu/algor/hicblas_cuda.h | 125 + src/trans/gpu/algor/hicblas_cutlass.cuda.h | 204 + src/trans/gpu/algor/hicblas_gemm.cuda.cu | 1 + src/trans/gpu/algor/hicblas_gemm.hip.cpp | 353 ++ src/trans/gpu/algor/hicblas_hip.h | 78 + src/trans/gpu/algor/hicblas_mod.F90 | 404 ++ src/trans/gpu/algor/hicfft.h | 52 + .../gpu/algor/hicfft_create_plan.cuda.cu | 1 + .../gpu/algor/hicfft_create_plan.hip.cpp | 79 + src/trans/gpu/algor/hicfft_cuda.h | 140 + .../gpu/algor/hicfft_destroy_plan.cuda.cu | 1 + .../gpu/algor/hicfft_destroy_plan.hip.cpp | 23 + .../gpu/algor/hicfft_execute_plan.cuda.cu | 1 + .../gpu/algor/hicfft_execute_plan.hip.cpp | 240 ++ src/trans/gpu/algor/hicfft_hip.h | 91 + src/trans/gpu/algor/seefmm_mix.F90 | 548 +++ src/trans/gpu/algor/wts500_mod.F90 | 3764 +++++++++++++++++ src/trans/gpu/external/dir_trans.F90 | 523 +++ src/trans/gpu/external/dir_transad.F90 | 145 + src/trans/gpu/external/dist_grid.F90 | 147 + src/trans/gpu/external/dist_grid_32.F90 | 140 + src/trans/gpu/external/dist_spec.F90 | 201 + src/trans/gpu/external/gath_grid.F90 | 140 + src/trans/gpu/external/gath_grid_32.F90 | 140 + src/trans/gpu/external/gath_spec.F90 | 194 + src/trans/gpu/external/get_current.F90 | 67 + src/trans/gpu/external/gpnorm_trans.F90 | 482 +++ src/trans/gpu/external/gpnorm_trans_gpu.F90 | 551 +++ src/trans/gpu/external/ini_spec_dist.F90 | 96 + src/trans/gpu/external/inv_trans.F90 | 645 +++ src/trans/gpu/external/inv_transad.F90 | 166 + src/trans/gpu/external/setup_trans.F90 | 705 +++ src/trans/gpu/external/setup_trans0.F90 | 300 ++ src/trans/gpu/external/specnorm.F90 | 140 + src/trans/gpu/external/sugawc.F90 | 102 + src/trans/gpu/external/trans_end.F90 | 154 + src/trans/gpu/external/trans_inq.F90 | 529 +++ src/trans/gpu/external/trans_pnm.F90 | 200 + src/trans/gpu/external/trans_release.F90 | 61 + src/trans/gpu/external/vordiv_to_uv.F90 | 179 + src/trans/gpu/internal/abort_trans_mod.F90 | 39 + .../gpu/internal/buffered_allocator_mod.F90 | 186 + src/trans/gpu/internal/cdmap_mod.F90 | 178 + src/trans/gpu/internal/cpledn_mod.F90 | 134 + src/trans/gpu/internal/dealloc_resol_mod.F90 | 189 + src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 184 + .../gpu/internal/dist_grid_32_ctl_mod.F90 | 258 ++ src/trans/gpu/internal/dist_grid_ctl_mod.F90 | 280 ++ .../gpu/internal/dist_spec_control_mod.F90 | 233 + .../gpu/internal/ectrans_version_mod.F90.in | 47 + src/trans/gpu/internal/eq_regions_mod.F90 | 443 ++ src/trans/gpu/internal/ext_acc.F90 | 357 ++ src/trans/gpu/internal/field_split_mod.F90 | 140 + src/trans/gpu/internal/fsc_mod.F90 | 280 ++ src/trans/gpu/internal/ftdir_mod.F90 | 118 + src/trans/gpu/internal/ftinv_mod.F90 | 118 + .../gpu/internal/gath_grid_32_ctl_mod.F90 | 277 ++ src/trans/gpu/internal/gath_grid_ctl_mod.F90 | 290 ++ .../gpu/internal/gath_spec_control_mod.F90 | 233 + src/trans/gpu/internal/gawl_mod.F90 | 118 + .../gpu/internal/growing_allocator_mod.F90 | 91 + src/trans/gpu/internal/inigptr_mod.F90 | 88 + src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 242 ++ src/trans/gpu/internal/ledir_mod.F90 | 404 ++ src/trans/gpu/internal/leinv_mod.F90 | 418 ++ src/trans/gpu/internal/ltdir_mod.F90 | 294 ++ src/trans/gpu/internal/ltinv_mod.F90 | 409 ++ src/trans/gpu/internal/myrecvset_mod.F90 | 83 + src/trans/gpu/internal/mysendset_mod.F90 | 80 + src/trans/gpu/internal/parkind_ectrans.F90 | 38 + src/trans/gpu/internal/pe2set_mod.F90 | 121 + src/trans/gpu/internal/pre_suleg_mod.F90 | 71 + src/trans/gpu/internal/prepsnm_mod.F90 | 105 + src/trans/gpu/internal/prfi1_mod.F90 | 114 + src/trans/gpu/internal/prfi1b_mod.F90 | 196 + src/trans/gpu/internal/read_legpol_mod.F90 | 235 + src/trans/gpu/internal/set2pe_mod.F90 | 131 + src/trans/gpu/internal/set_resol_mod.F90 | 73 + src/trans/gpu/internal/setup_dims_mod.F90 | 50 + src/trans/gpu/internal/setup_geom_mod.F90 | 110 + src/trans/gpu/internal/shuffle_mod.F90 | 137 + src/trans/gpu/internal/spnorm_ctl_mod.F90 | 62 + src/trans/gpu/internal/spnormc_mod.F90 | 89 + src/trans/gpu/internal/spnormd_mod.F90 | 66 + src/trans/gpu/internal/spnsde_mod.F90 | 151 + src/trans/gpu/internal/sufft_mod.F90 | 48 + src/trans/gpu/internal/sugaw_mod.F90 | 431 ++ src/trans/gpu/internal/suleg_mod.F90 | 877 ++++ src/trans/gpu/internal/sump_trans0_mod.F90 | 115 + src/trans/gpu/internal/sump_trans_mod.F90 | 299 ++ .../gpu/internal/sump_trans_preleg_mod.F90 | 149 + src/trans/gpu/internal/sumplat_mod.F90 | 256 ++ src/trans/gpu/internal/sumplatb_mod.F90 | 226 + src/trans/gpu/internal/sumplatbeq_mod.F90 | 289 ++ src/trans/gpu/internal/sumplatf_mod.F90 | 150 + src/trans/gpu/internal/supol_mod.F90 | 172 + src/trans/gpu/internal/supolf_mod.F90 | 284 ++ src/trans/gpu/internal/sustaonl_mod.F90 | 457 ++ src/trans/gpu/internal/sutrle_mod.F90 | 364 ++ src/trans/gpu/internal/suwavedi_mod.F90 | 186 + src/trans/gpu/internal/tpm_constants.F90 | 20 + src/trans/gpu/internal/tpm_ctl.F90 | 43 + src/trans/gpu/internal/tpm_dim.F90 | 58 + src/trans/gpu/internal/tpm_distr.F90 | 196 + src/trans/gpu/internal/tpm_fft.F90 | 29 + src/trans/gpu/internal/tpm_fields.F90 | 58 + src/trans/gpu/internal/tpm_flt.F90 | 64 + src/trans/gpu/internal/tpm_gen.F90 | 45 + src/trans/gpu/internal/tpm_geometry.F90 | 45 + src/trans/gpu/internal/tpm_hicfft.F90 | 346 ++ src/trans/gpu/internal/tpm_pol.F90 | 120 + src/trans/gpu/internal/tpm_stats.F90 | 66 + src/trans/gpu/internal/tpm_trans.F90 | 69 + src/trans/gpu/internal/trgtol_mod.F90 | 751 ++++ src/trans/gpu/internal/trltog_mod.F90 | 916 ++++ src/trans/gpu/internal/trltom_mod.F90 | 232 + src/trans/gpu/internal/trltom_pack_unpack.F90 | 268 ++ src/trans/gpu/internal/trmtol_mod.F90 | 221 + src/trans/gpu/internal/trmtol_pack_unpack.F90 | 293 ++ src/trans/gpu/internal/updsp_mod.F90 | 164 + src/trans/gpu/internal/updspb_mod.F90 | 143 + src/trans/gpu/internal/uvtvd_mod.F90 | 177 + src/trans/gpu/internal/vd2uv_ctl_mod.F90 | 81 + src/trans/gpu/internal/vd2uv_mod.F90 | 157 + src/trans/gpu/internal/vdtuv_mod.F90 | 165 + src/trans/gpu/internal/write_legpol_mod.F90 | 187 + src/trans/gpu/sharedmem/sharedmem.c | 28 + src/trans/gpu/sharedmem/sharedmem_mod.F90 | 314 ++ src/trans/include/ectrans/vordiv_to_uv.h | 2 +- tests/CMakeLists.txt | 145 +- 297 files changed, 33220 insertions(+), 407 deletions(-) create mode 100644 cmake/FindCUDAToolkit-cmake-3.24/FindCUDAToolkit.cmake create mode 100644 cmake/ectrans_find_cuda.cmake create mode 100644 cmake/ectrans_find_hip.cmake create mode 100644 src/programs/ectrans-benchmark-ifs.F90 create mode 100644 src/trans/cpu/CMakeLists.txt rename src/trans/{ => cpu}/algor/butterfly_alg_mod.F90 (100%) rename src/trans/{ => cpu}/algor/ectrans_blas_mod.F90 (100%) rename src/trans/{ => cpu}/algor/interpol_decomp_mod.F90 (100%) rename src/trans/{ => cpu}/algor/seefmm_mix.F90 (100%) rename src/trans/{ => cpu}/algor/wts500_mod.F90 (100%) rename src/trans/{ => cpu}/external/dir_trans.F90 (100%) rename src/trans/{ => cpu}/external/dir_transad.F90 (100%) rename src/trans/{ => cpu}/external/dist_grid.F90 (100%) rename src/trans/{ => cpu}/external/dist_grid_32.F90 (100%) rename src/trans/{ => cpu}/external/dist_spec.F90 (100%) rename src/trans/{ => cpu}/external/gath_grid.F90 (100%) rename src/trans/{ => cpu}/external/gath_grid_32.F90 (100%) rename src/trans/{ => cpu}/external/gath_spec.F90 (100%) rename src/trans/{ => cpu}/external/get_current.F90 (100%) rename src/trans/{ => cpu}/external/gpnorm_trans.F90 (100%) rename src/trans/{ => cpu}/external/ini_spec_dist.F90 (100%) rename src/trans/{ => cpu}/external/inv_trans.F90 (100%) rename src/trans/{ => cpu}/external/inv_transad.F90 (100%) rename src/trans/{ => cpu}/external/setup_trans.F90 (100%) rename src/trans/{ => cpu}/external/setup_trans0.F90 (99%) rename src/trans/{ => cpu}/external/specnorm.F90 (100%) rename src/trans/{ => cpu}/external/trans_end.F90 (100%) rename src/trans/{ => cpu}/external/trans_inq.F90 (100%) rename src/trans/{ => cpu}/external/trans_pnm.F90 (100%) rename src/trans/{ => cpu}/external/trans_release.F90 (100%) rename src/trans/{ => cpu}/external/vordiv_to_uv.F90 (100%) rename src/trans/{ => cpu}/internal/abort_trans_mod.F90 (100%) rename src/trans/{ => cpu}/internal/asre1_mod.F90 (100%) rename src/trans/{ => cpu}/internal/asre1ad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/asre1b_mod.F90 (100%) rename src/trans/{ => cpu}/internal/asre1bad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/cdmap_mod.F90 (100%) rename src/trans/{ => cpu}/internal/cpledn_mod.F90 (100%) rename src/trans/{ => cpu}/internal/dealloc_resol_mod.F90 (100%) rename src/trans/{ => cpu}/internal/dir_trans_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/dir_trans_ctlad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/dist_grid_32_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/dist_grid_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/dist_spec_control_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ectrans_version_mod.F90.in (100%) rename src/trans/{ => cpu}/internal/eq_regions_mod.F90 (100%) rename src/trans/{ => cpu}/internal/field_split_mod.F90 (100%) rename src/trans/{ => cpu}/internal/fourier_in_mod.F90 (100%) rename src/trans/{ => cpu}/internal/fourier_inad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/fourier_out_mod.F90 (100%) rename src/trans/{ => cpu}/internal/fourier_outad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/fsc_mod.F90 (100%) rename src/trans/{ => cpu}/internal/fscad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/fspgl_int_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ftdir_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ftdir_ctlad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ftdir_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ftdirad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ftinv_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ftinv_ctlad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ftinv_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ftinvad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/gath_grid_32_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/gath_grid_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/gath_spec_control_mod.F90 (100%) rename src/trans/{ => cpu}/internal/gawl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/gpnorm_trans_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/inigptr_mod.F90 (100%) rename src/trans/{ => cpu}/internal/inv_trans_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/inv_trans_ctlad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ldfou2_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ldfou2ad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ledir_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ledirad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/leinv_mod.F90 (100%) rename src/trans/{ => cpu}/internal/leinvad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ltdir_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ltdir_ctlad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ltdir_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ltdirad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ltinv_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ltinv_ctlad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ltinv_mod.F90 (100%) rename src/trans/{ => cpu}/internal/ltinvad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/myrecvset_mod.F90 (100%) rename src/trans/{ => cpu}/internal/mysendset_mod.F90 (100%) rename src/trans/{ => cpu}/internal/pe2set_mod.F90 (100%) rename src/trans/{ => cpu}/internal/pre_suleg_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prepsnm_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prfi1_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prfi1ad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prfi1b_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prfi1bad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prfi2_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prfi2ad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prfi2b_mod.F90 (100%) rename src/trans/{ => cpu}/internal/prfi2bad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/read_legpol_mod.F90 (100%) rename src/trans/{ => cpu}/internal/set2pe_mod.F90 (100%) rename src/trans/{ => cpu}/internal/set_resol_mod.F90 (100%) rename src/trans/{ => cpu}/internal/setup_dims_mod.F90 (100%) rename src/trans/{ => cpu}/internal/setup_geom_mod.F90 (100%) rename src/trans/{ => cpu}/internal/shuffle_mod.F90 (100%) rename src/trans/{ => cpu}/internal/spnorm_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/spnormc_mod.F90 (100%) rename src/trans/{ => cpu}/internal/spnormd_mod.F90 (100%) rename src/trans/{ => cpu}/internal/spnsde_mod.F90 (100%) rename src/trans/{ => cpu}/internal/spnsdead_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sugaw_mod.F90 (100%) rename src/trans/{ => cpu}/internal/suleg_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sump_trans0_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sump_trans_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sump_trans_preleg_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sumplat_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sumplatb_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sumplatbeq_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sumplatf_mod.F90 (100%) rename src/trans/{ => cpu}/internal/supol_mod.F90 (100%) rename src/trans/{ => cpu}/internal/supolf_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sustaonl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/sutrle_mod.F90 (100%) rename src/trans/{ => cpu}/internal/suwavedi_mod.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_constants.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_ctl.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_dim.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_distr.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_fftw.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_fields.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_flt.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_gen.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_geometry.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_pol.F90 (100%) rename src/trans/{ => cpu}/internal/tpm_trans.F90 (100%) rename src/trans/{ => cpu}/internal/trgtol_mod.F90 (100%) rename src/trans/{ => cpu}/internal/trltog_mod.F90 (100%) rename src/trans/{ => cpu}/internal/trltom_mod.F90 (100%) rename src/trans/{ => cpu}/internal/trmtol_mod.F90 (100%) rename src/trans/{ => cpu}/internal/updsp_mod.F90 (100%) rename src/trans/{ => cpu}/internal/updspad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/updspb_mod.F90 (100%) rename src/trans/{ => cpu}/internal/updspbad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/uvtvd_mod.F90 (100%) rename src/trans/{ => cpu}/internal/uvtvdad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/vd2uv_ctl_mod.F90 (100%) rename src/trans/{ => cpu}/internal/vd2uv_mod.F90 (100%) rename src/trans/{ => cpu}/internal/vdtuv_mod.F90 (100%) rename src/trans/{ => cpu}/internal/vdtuvad_mod.F90 (100%) rename src/trans/{ => cpu}/internal/write_legpol_mod.F90 (100%) rename src/trans/{ => cpu}/maybe_unused/external/sugawc.F90 (100%) mode change 100644 => 100755 rename src/trans/{ => cpu}/maybe_unused/include/ectrans/sugawc.h (100%) rename src/trans/{ => cpu}/sedrenames.txt (100%) rename src/trans/{ => cpu}/sharedmem/sharedmem.c (100%) rename src/trans/{ => cpu}/sharedmem/sharedmem_mod.F90 (100%) create mode 100644 src/trans/gpu/CMakeLists.txt create mode 100644 src/trans/gpu/algor/c_hipmemgetinfo.cpp create mode 100644 src/trans/gpu/algor/device_mod.F90 create mode 100644 src/trans/gpu/algor/growing_allocator.h create mode 100644 src/trans/gpu/algor/hicblas.h create mode 100644 src/trans/gpu/algor/hicblas_cuda.h create mode 100644 src/trans/gpu/algor/hicblas_cutlass.cuda.h create mode 120000 src/trans/gpu/algor/hicblas_gemm.cuda.cu create mode 100644 src/trans/gpu/algor/hicblas_gemm.hip.cpp create mode 100644 src/trans/gpu/algor/hicblas_hip.h create mode 100644 src/trans/gpu/algor/hicblas_mod.F90 create mode 100644 src/trans/gpu/algor/hicfft.h create mode 120000 src/trans/gpu/algor/hicfft_create_plan.cuda.cu create mode 100644 src/trans/gpu/algor/hicfft_create_plan.hip.cpp create mode 100644 src/trans/gpu/algor/hicfft_cuda.h create mode 120000 src/trans/gpu/algor/hicfft_destroy_plan.cuda.cu create mode 100644 src/trans/gpu/algor/hicfft_destroy_plan.hip.cpp create mode 120000 src/trans/gpu/algor/hicfft_execute_plan.cuda.cu create mode 100644 src/trans/gpu/algor/hicfft_execute_plan.hip.cpp create mode 100644 src/trans/gpu/algor/hicfft_hip.h create mode 100644 src/trans/gpu/algor/seefmm_mix.F90 create mode 100644 src/trans/gpu/algor/wts500_mod.F90 create mode 100755 src/trans/gpu/external/dir_trans.F90 create mode 100755 src/trans/gpu/external/dir_transad.F90 create mode 100755 src/trans/gpu/external/dist_grid.F90 create mode 100755 src/trans/gpu/external/dist_grid_32.F90 create mode 100755 src/trans/gpu/external/dist_spec.F90 create mode 100755 src/trans/gpu/external/gath_grid.F90 create mode 100755 src/trans/gpu/external/gath_grid_32.F90 create mode 100755 src/trans/gpu/external/gath_spec.F90 create mode 100755 src/trans/gpu/external/get_current.F90 create mode 100755 src/trans/gpu/external/gpnorm_trans.F90 create mode 100755 src/trans/gpu/external/gpnorm_trans_gpu.F90 create mode 100755 src/trans/gpu/external/ini_spec_dist.F90 create mode 100755 src/trans/gpu/external/inv_trans.F90 create mode 100755 src/trans/gpu/external/inv_transad.F90 create mode 100755 src/trans/gpu/external/setup_trans.F90 create mode 100755 src/trans/gpu/external/setup_trans0.F90 create mode 100755 src/trans/gpu/external/specnorm.F90 create mode 100755 src/trans/gpu/external/sugawc.F90 create mode 100755 src/trans/gpu/external/trans_end.F90 create mode 100755 src/trans/gpu/external/trans_inq.F90 create mode 100755 src/trans/gpu/external/trans_pnm.F90 create mode 100755 src/trans/gpu/external/trans_release.F90 create mode 100755 src/trans/gpu/external/vordiv_to_uv.F90 create mode 100755 src/trans/gpu/internal/abort_trans_mod.F90 create mode 100644 src/trans/gpu/internal/buffered_allocator_mod.F90 create mode 100755 src/trans/gpu/internal/cdmap_mod.F90 create mode 100755 src/trans/gpu/internal/cpledn_mod.F90 create mode 100755 src/trans/gpu/internal/dealloc_resol_mod.F90 create mode 100755 src/trans/gpu/internal/dir_trans_ctl_mod.F90 create mode 100755 src/trans/gpu/internal/dist_grid_32_ctl_mod.F90 create mode 100755 src/trans/gpu/internal/dist_grid_ctl_mod.F90 create mode 100755 src/trans/gpu/internal/dist_spec_control_mod.F90 create mode 100644 src/trans/gpu/internal/ectrans_version_mod.F90.in create mode 100755 src/trans/gpu/internal/eq_regions_mod.F90 create mode 100644 src/trans/gpu/internal/ext_acc.F90 create mode 100755 src/trans/gpu/internal/field_split_mod.F90 create mode 100755 src/trans/gpu/internal/fsc_mod.F90 create mode 100755 src/trans/gpu/internal/ftdir_mod.F90 create mode 100755 src/trans/gpu/internal/ftinv_mod.F90 create mode 100755 src/trans/gpu/internal/gath_grid_32_ctl_mod.F90 create mode 100755 src/trans/gpu/internal/gath_grid_ctl_mod.F90 create mode 100755 src/trans/gpu/internal/gath_spec_control_mod.F90 create mode 100755 src/trans/gpu/internal/gawl_mod.F90 create mode 100644 src/trans/gpu/internal/growing_allocator_mod.F90 create mode 100755 src/trans/gpu/internal/inigptr_mod.F90 create mode 100644 src/trans/gpu/internal/inv_trans_ctl_mod.F90 create mode 100755 src/trans/gpu/internal/ledir_mod.F90 create mode 100755 src/trans/gpu/internal/leinv_mod.F90 create mode 100755 src/trans/gpu/internal/ltdir_mod.F90 create mode 100755 src/trans/gpu/internal/ltinv_mod.F90 create mode 100755 src/trans/gpu/internal/myrecvset_mod.F90 create mode 100755 src/trans/gpu/internal/mysendset_mod.F90 create mode 100644 src/trans/gpu/internal/parkind_ectrans.F90 create mode 100755 src/trans/gpu/internal/pe2set_mod.F90 create mode 100755 src/trans/gpu/internal/pre_suleg_mod.F90 create mode 100755 src/trans/gpu/internal/prepsnm_mod.F90 create mode 100755 src/trans/gpu/internal/prfi1_mod.F90 create mode 100755 src/trans/gpu/internal/prfi1b_mod.F90 create mode 100755 src/trans/gpu/internal/read_legpol_mod.F90 create mode 100755 src/trans/gpu/internal/set2pe_mod.F90 create mode 100755 src/trans/gpu/internal/set_resol_mod.F90 create mode 100755 src/trans/gpu/internal/setup_dims_mod.F90 create mode 100755 src/trans/gpu/internal/setup_geom_mod.F90 create mode 100755 src/trans/gpu/internal/shuffle_mod.F90 create mode 100755 src/trans/gpu/internal/spnorm_ctl_mod.F90 create mode 100755 src/trans/gpu/internal/spnormc_mod.F90 create mode 100755 src/trans/gpu/internal/spnormd_mod.F90 create mode 100755 src/trans/gpu/internal/spnsde_mod.F90 create mode 100755 src/trans/gpu/internal/sufft_mod.F90 create mode 100755 src/trans/gpu/internal/sugaw_mod.F90 create mode 100755 src/trans/gpu/internal/suleg_mod.F90 create mode 100755 src/trans/gpu/internal/sump_trans0_mod.F90 create mode 100755 src/trans/gpu/internal/sump_trans_mod.F90 create mode 100755 src/trans/gpu/internal/sump_trans_preleg_mod.F90 create mode 100755 src/trans/gpu/internal/sumplat_mod.F90 create mode 100755 src/trans/gpu/internal/sumplatb_mod.F90 create mode 100755 src/trans/gpu/internal/sumplatbeq_mod.F90 create mode 100755 src/trans/gpu/internal/sumplatf_mod.F90 create mode 100755 src/trans/gpu/internal/supol_mod.F90 create mode 100755 src/trans/gpu/internal/supolf_mod.F90 create mode 100755 src/trans/gpu/internal/sustaonl_mod.F90 create mode 100755 src/trans/gpu/internal/sutrle_mod.F90 create mode 100755 src/trans/gpu/internal/suwavedi_mod.F90 create mode 100755 src/trans/gpu/internal/tpm_constants.F90 create mode 100755 src/trans/gpu/internal/tpm_ctl.F90 create mode 100755 src/trans/gpu/internal/tpm_dim.F90 create mode 100755 src/trans/gpu/internal/tpm_distr.F90 create mode 100755 src/trans/gpu/internal/tpm_fft.F90 create mode 100755 src/trans/gpu/internal/tpm_fields.F90 create mode 100755 src/trans/gpu/internal/tpm_flt.F90 create mode 100755 src/trans/gpu/internal/tpm_gen.F90 create mode 100755 src/trans/gpu/internal/tpm_geometry.F90 create mode 100755 src/trans/gpu/internal/tpm_hicfft.F90 create mode 100755 src/trans/gpu/internal/tpm_pol.F90 create mode 100644 src/trans/gpu/internal/tpm_stats.F90 create mode 100755 src/trans/gpu/internal/tpm_trans.F90 create mode 100755 src/trans/gpu/internal/trgtol_mod.F90 create mode 100755 src/trans/gpu/internal/trltog_mod.F90 create mode 100755 src/trans/gpu/internal/trltom_mod.F90 create mode 100755 src/trans/gpu/internal/trltom_pack_unpack.F90 create mode 100755 src/trans/gpu/internal/trmtol_mod.F90 create mode 100755 src/trans/gpu/internal/trmtol_pack_unpack.F90 create mode 100755 src/trans/gpu/internal/updsp_mod.F90 create mode 100755 src/trans/gpu/internal/updspb_mod.F90 create mode 100755 src/trans/gpu/internal/uvtvd_mod.F90 create mode 100755 src/trans/gpu/internal/vd2uv_ctl_mod.F90 create mode 100755 src/trans/gpu/internal/vd2uv_mod.F90 create mode 100755 src/trans/gpu/internal/vdtuv_mod.F90 create mode 100755 src/trans/gpu/internal/write_legpol_mod.F90 create mode 100644 src/trans/gpu/sharedmem/sharedmem.c create mode 100644 src/trans/gpu/sharedmem/sharedmem_mod.F90 diff --git a/AUTHORS b/AUTHORS index c8f0a1cc..92f1d05d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,20 +1,23 @@ Authors and Contributors ======================== +- P. Courtier (ECMWF) - W. Deconinck (ECMWF) +- D. Degrauwe (RMI) - D. Dent (ECMWF) - P. Dueben (ECMWF) - R. El Khatib (Meteo France) +- D. Giard (Meteo France) - J. Hague (ECMWF) - M. Hamrud (ECMWF) +- S. Hatfield (ECMWF) +- M. Hortal (ECMWF) - L. Isaksen (ECMWF) -- G. Mozdzynski (ECMWF) - P. Marguinaud (Meteo France) +- O. Marsden (ECMWF) +- L. Mosimann (NVIDIA) +- G. Mozdzynski (ECMWF) - A. Mueller (ECMWF) -- M. Hortal (ECMWF) -- P. Courtier (ECMWF) -- D. Degrauwe (RMI) -- D. Giard (Meteo France) - G. Radnoti (ECMWF) - D. Salmond (ECMWF) - Y. Seity (Meteo France) diff --git a/CMakeLists.txt b/CMakeLists.txt index d293b207..45f3cb7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,25 +6,27 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. -cmake_minimum_required( VERSION 3.12 FATAL_ERROR ) - -# Once we update the minimum required version to 3.20, -# we can use the command `cmake_path` instead of `get_filename_component`. -# For discussion, see https://github.com/ecmwf-ifs/ectrans/pull/61#discussion_r1494362092 +cmake_minimum_required( VERSION 3.18 FATAL_ERROR ) +# CMake 3.17 adds INTERFACE link options which get propagated to the link stage, +# even if the target is linked in privately +# CMake 3.18 allows "LINK_LANG_AND_ID" generator expression. find_package( ecbuild 3.4 REQUIRED HINTS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../ecbuild ) -project( ectrans LANGUAGES C Fortran ) +project( ectrans LANGUAGES C CXX Fortran ) include( ectrans_macros ) +set(CMAKE_CXX_STANDARD 17) + ecbuild_enable_fortran( REQUIRED NO_MODULE_DIRECTORY ) -### Find (optional) dependencies +### Find (optional) dependencies ecbuild_find_package( NAME fiat REQUIRED ) -ecbuild_add_option( FEATURE MPI +ecbuild_add_option( FEATURE MPI DESCRIPTION "Support for MPI distributed memory parallelism" + REQUIRED_PACKAGES "MPI COMPONENTS Fortran CXX" CONDITION fiat_HAVE_MPI ) ecbuild_add_option( FEATURE OMP @@ -32,6 +34,22 @@ ecbuild_add_option( FEATURE OMP DESCRIPTION "Support for OpenMP shared memory parallelism" REQUIRED_PACKAGES "OpenMP COMPONENTS Fortran" ) +if( ${CMAKE_VERSION} VERSION_LESS "3.25" AND (NOT DEFINED ENABLE_ACC OR ENABLE_ACC ) ) + # See https://gitlab.kitware.com/cmake/cmake/-/issues/23691, fixed in CMake 3.25 + # (TL;DR: FindOpenACC sets OpenACC__FOUND correctly but does not set + # OpenACC_FOUND unless all three C, CXX, and Fortran have been found - even if + # only one language has been requested via COMPONENTS) + find_package( OpenACC COMPONENTS Fortran ) + if( OpenACC_Fortran_FOUND ) + set( OpenACC_FOUND ON ) + endif() +endif() + +ecbuild_add_option( FEATURE ACC + DEFAULT OFF + DESCRIPTION "Support for using GPUs with OpenACC" + REQUIRED_PACKAGES "OpenACC COMPONENTS Fortran" ) + ecbuild_add_option( FEATURE DOUBLE_PRECISION DEFAULT ON DESCRIPTION "Support for Double Precision" ) @@ -55,15 +73,90 @@ if( NOT HAVE_MKL ) option( FFTW_ENABLE_MKL OFF ) endif() -ecbuild_find_package( NAME FFTW REQUIRED COMPONENTS double ${single} ) +ecbuild_add_option( FEATURE CPU + DEFAULT ON + DESCRIPTION "Compile CPU version of ectrans" + ) + +if( HAVE_CPU ) + ecbuild_find_package( NAME FFTW REQUIRED COMPONENTS double ${single} ) +endif() ecbuild_add_option( FEATURE TRANSI DEFAULT ON DESCRIPTION "Compile TransI C-interface to trans" - CONDITION HAVE_DOUBLE_PRECISION ) + CONDITION HAVE_DOUBLE_PRECISION AND HAVE_CPU ) + +# Search for available GPU runtimes, searching for CUDA first and, if not found, +# attempt to find HIP +if( ECTRANS_ENABLE_GPU OR (NOT DEFINED ECTRANS_ENABLE_GPU AND ENABLE_GPU)) + set(HAVE_CUDA 0) + set(HAVE_HIP 0) + ectrans_find_cuda() # sets "HAVE_CUDA" + if( NOT HAVE_CUDA ) + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") + ectrans_find_hip() # sets "HAVE_HIP" + else() + message("Cannot enable for HIP language, require CMake >= 3.24") + endif() + endif() +endif() + +ecbuild_add_option( FEATURE GPU + DEFAULT OFF + DESCRIPTION "Compile GPU version of ectrans (Requires OpenACC or sufficient OpenMP offloading support and MPI)" + CONDITION (HAVE_HIP OR HAVE_CUDA) AND (HAVE_ACC OR HAVE_OMP) AND HAVE_MPI ) + +if( HAVE_GPU ) + if( HAVE_ACC ) + set( GPU_OFFLOAD "ACC" ) + elseif( HAVE_OMP ) + set( GPU_OFFLOAD "OMP" ) + else() + ecbuild_error("Could not enable GPU as OMP or ACC were not enabled") + endif() +endif() + +ecbuild_add_option( FEATURE CUTLASS + DEFAULT OFF + CONDITION HAVE_GPU AND HAVE_CUDA AND CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC" + DESCRIPTION "Support for Cutlass BLAS operations" + REQUIRED_PACKAGES "NvidiaCutlass VERSION 2.11" ) + +# following also needs cuda arch sm80 to be effective +ecbuild_add_option( FEATURE CUTLASS_3XTF32 + DEFAULT ON + CONDITION HAVE_SINGLE_PRECISION AND HAVE_CUTLASS + DESCRIPTION "Support for 3xTF32 with Cutlass (>= 2.8) and CUDA_ARCHITECTURES >= 80" ) + +ecbuild_add_option( FEATURE GPU_AWARE_MPI + DEFAULT ON + CONDITION HAVE_GPU + REQUIRED_PACKAGES "MPI COMPONENTS CXX Fortran" + DESCRIPTION "Enable CUDA-aware MPI" ) + +ecbuild_add_option( FEATURE GPU_GRAPHS_GEMM + DEFAULT ON + CONDITION HAVE_GPU + DESCRIPTION "Enable graph-based optimisation of Legendre transform GEMM kernel" ) + +if( BUILD_SHARED_LIBS ) + set( GPU_STATIC_DEFAULT OFF ) +else() + set( GPU_STATIC_DEFAULT ON ) +endif() +ecbuild_add_option( FEATURE GPU_STATIC + DEFAULT ${GPU_STATIC_DEFAULT} + DESCRIPTION "Compile GPU library as static library") ectrans_find_lapack() +ecbuild_add_option( FEATURE TESTS + DEFAULT ON + DESCRIPTION "Enable unit testing" + REQUIRED_PACKAGES "MPI COMPONENTS Fortran" + CONDITION HAVE_CPU ) + ### Add sources and tests include( ectrans_compile_options ) add_subdirectory( src ) @@ -79,4 +172,3 @@ endif() ecbuild_install_project( NAME ${PROJECT_NAME} ) ecbuild_print_summary() - diff --git a/README.md b/README.md index 18b9abd9..90000c1c 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,14 @@ Introduction ============ ecTrans is the global spherical Harmonics transforms library, extracted from the IFS. -It is using a hybrid of MPI and OpenMP parallelisation strategies. -The package contains both single- and double precision Fortran libraries (trans_sp, trans_dp), -as well as a C interface to the double-precision version (transi_dp) +It contains both CPU and GPU (Nvidia) code-paths. +The CPU version uses a hybrid of MPI and OpenMP parallelisation strategies, while the GPU version combines MPI and OpenACC. +A default installation builds both CPU libraries (trans_sp, trans_dp) and various flavours of GPU libraries in (trans_gpu_{sp/dp} shared library, trans_gpu_static_{sp/dp} static library, trans_gpu_static_CA_{sp/dp} static library requiring CUDA-aware MPI implementation), as well as a C interface to the double-precision version (transi_dp). A simple benchmark driver is also built against each of these libraries, allowing simple testing of the transforms. License ======= -Trans is distributed under the Apache License Version 2.0. +ecTrans is distributed under the Apache License Version 2.0. See `LICENSE` file for details. Installing ecTrans @@ -26,6 +26,8 @@ Supported Platforms Other UNIX-like operating systems may work too out of the box. +The GPU codepath has only been tested with NVHPC compilers on recent Nvidia GPUs. + Requirements ------------ - Fortran compiler with OpenMP support @@ -38,6 +40,10 @@ Requirements Further optional recommended dependencies: - FFTW (http://www.fftw.org) +For the GPU libraries : +- Fortran compiler with OpenACC support +- CUDA toolkit (compiler, and CUBLAS and CUFFT libraries) + Building ecTrans ---------------- @@ -69,8 +75,21 @@ Extra options can be added to the `cmake` command to control the build: - `-DENABLE_DOUBLE_PRECISION=` default=ON - `-DENABLE_TRANSI=` default=ON - `-DENABLE_MKL=` default=ON + - `-DENABLE_FFTW=` default=ON + - `-DENABLE_GPU=` default=OFF - `-DCMAKE_INSTALL_PREFIX=` +Specific extra options exist for GPU installation: + - `-DENABLE_GPU_AWARE_MPI=` default=OF + - `-DENABLE_GPU_GRAPHS_GEMM=` default=ON + - `-DENABLE_CUTLASS=` default=OFF + - `-DENABLE_3XTF32=` default=OFF + +GPU-aware MPI allows buffers residing on GPU to be passed to MPI communication calls directly. This requires a compatible MPI installation. +Graph work-flows allow a series of GPU operations to be scheduled in an efficient manner. +This is useful both for the batched FFTs and the batched GEMMs on which ecTrans relies, although for FFTs this is currently relied upon. +Cutlass is an Nvidia library of templates for GEMM operations. 3xTF32 is a specific acceleration for single precision operations, enabled by Cutlass. + More options to control compilation flags, only when defaults are not sufficient - `-DCMAKE_Fortran_FLAGS=` @@ -82,6 +101,10 @@ Optionally, tests can be run to check succesful compilation, when the feature TE $ ctest +The benchmark drivers are found in the bin directory. +A brief description of available command-line arguments can be obtained with e.g. +ectrans-benchmark-cpu-sp --help + Reporting Bugs ============== diff --git a/cmake/FindCUDAToolkit-cmake-3.24/FindCUDAToolkit.cmake b/cmake/FindCUDAToolkit-cmake-3.24/FindCUDAToolkit.cmake new file mode 100644 index 00000000..970d8e90 --- /dev/null +++ b/cmake/FindCUDAToolkit-cmake-3.24/FindCUDAToolkit.cmake @@ -0,0 +1,1026 @@ +# Copy of CMake version 3.24 which adds support for finding cublas and cufft in +# directory parallel to the cuda libraries. +# This file can be deleted when using CMake 3.22 + +ecbuild_warn("Using FindCUDAToolkit backported from cmake 3.24") + +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +#[=======================================================================[.rst: +FindCUDAToolkit +--------------- + +.. versionadded:: 3.17 + +This script locates the NVIDIA CUDA toolkit and the associated libraries, but +does not require the ``CUDA`` language be enabled for a given project. This +module does not search for the NVIDIA CUDA Samples. + +.. versionadded:: 3.19 + QNX support. + +Search Behavior +^^^^^^^^^^^^^^^ + +The CUDA Toolkit search behavior uses the following order: + +1. If the ``CUDA`` language has been enabled we will use the directory + containing the compiler as the first search location for ``nvcc``. + +2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g., + ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it + will be searched. If both an environment variable **and** a + configuration variable are specified, the *configuration* variable takes + precedence. + + The directory specified here must be such that the executable ``nvcc`` or + the appropriate ``version.txt`` file can be found underneath the specified + directory. + +3. If the CUDA_PATH environment variable is defined, it will be searched + for ``nvcc``. + +4. The user's path is searched for ``nvcc`` using :command:`find_program`. If + this is found, no subsequent search attempts are performed. Users are + responsible for ensuring that the first ``nvcc`` to show up in the path is + the desired path in the event that multiple CUDA Toolkits are installed. + +5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is + used. No subsequent search attempts are performed. No default symbolic link + location exists for the Windows platform. + +6. The platform specific default install locations are searched. If exactly one + candidate is found, this is used. The default CUDA Toolkit install locations + searched are: + + +-------------+-------------------------------------------------------------+ + | Platform | Search Pattern | + +=============+=============================================================+ + | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Other Unix | ``/usr/local/cuda-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` | + +-------------+-------------------------------------------------------------+ + + Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as + ``/usr/local/cuda-9.0`` or + ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`` + + .. note:: + + When multiple CUDA Toolkits are installed in the default location of a + system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0`` + exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this + package is marked as **not** found. + + There are too many factors involved in making an automatic decision in + the presence of multiple CUDA Toolkits being installed. In this + situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or + (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for + :command:`find_program` to find. + +Arguments +^^^^^^^^^ + +``[]`` + The ``[]`` argument requests a version with which the package found + should be compatible. See :ref:`find_package version format ` + for more details. + +Options +^^^^^^^ + +``REQUIRED`` + If specified, configuration will error if a suitable CUDA Toolkit is not + found. + +``QUIET`` + If specified, the search for a suitable CUDA Toolkit will not produce any + messages. + +``EXACT`` + If specified, the CUDA Toolkit is considered found only if the exact + ``VERSION`` specified is recovered. + +Imported targets +^^^^^^^^^^^^^^^^ + +An :ref:`imported target ` named ``CUDA::toolkit`` is provided. + +This module defines :prop_tgt:`IMPORTED` targets for each +of the following libraries that are part of the CUDAToolkit: + +- :ref:`CUDA Runtime Library` +- :ref:`CUDA Driver Library` +- :ref:`cuBLAS` +- :ref:`cuFFT` +- :ref:`cuRAND` +- :ref:`cuSOLVER` +- :ref:`cuSPARSE` +- :ref:`cuPTI` +- :ref:`NPP` +- :ref:`nvBLAS` +- :ref:`nvGRAPH` +- :ref:`nvJPEG` +- :ref:`nvidia-ML` +- :ref:`nvRTC` +- :ref:`nvToolsExt` +- :ref:`OpenCL` +- :ref:`cuLIBOS` + +.. _`cuda_toolkit_rt_lib`: + +CUDA Runtime Library +"""""""""""""""""""" + +The CUDA Runtime library (cudart) are what most applications will typically +need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. + +Targets Created: + +- ``CUDA::cudart`` +- ``CUDA::cudart_static`` + +.. _`cuda_toolkit_driver_lib`: + +CUDA Driver Library +"""""""""""""""""""" + +The CUDA Driver library (cuda) are used by applications that use calls +such as `cuMemAlloc`, and `cuMemFree`. + +Targets Created: + +- ``CUDA::cuda_driver`` + +.. _`cuda_toolkit_cuBLAS`: + +cuBLAS +"""""" + +The `cuBLAS `_ library. + +Targets Created: + +- ``CUDA::cublas`` +- ``CUDA::cublas_static`` +- ``CUDA::cublasLt`` starting in CUDA 10.1 +- ``CUDA::cublasLt_static`` starting in CUDA 10.1 + +.. _`cuda_toolkit_cuFFT`: + +cuFFT +""""" + +The `cuFFT `_ library. + +Targets Created: + +- ``CUDA::cufft`` +- ``CUDA::cufftw`` +- ``CUDA::cufft_static`` +- ``CUDA::cufft_static_nocallback`` starting in CUDA 9.2, requires CMake 3.23+ +- ``CUDA::cufftw_static`` + +cuRAND +"""""" + +The `cuRAND `_ library. + +Targets Created: + +- ``CUDA::curand`` +- ``CUDA::curand_static`` + +.. _`cuda_toolkit_cuSOLVER`: + +cuSOLVER +"""""""" + +The `cuSOLVER `_ library. + +Targets Created: + +- ``CUDA::cusolver`` +- ``CUDA::cusolver_static`` + +.. _`cuda_toolkit_cuSPARSE`: + +cuSPARSE +"""""""" + +The `cuSPARSE `_ library. + +Targets Created: + +- ``CUDA::cusparse`` +- ``CUDA::cusparse_static`` + +.. _`cuda_toolkit_cupti`: + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface `_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + +.. _`cuda_toolkit_NPP`: + +NPP +""" + +The `NPP `_ libraries. + +Targets Created: + +- `nppc`: + + - ``CUDA::nppc`` + - ``CUDA::nppc_static`` + +- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h` + + - ``CUDA::nppial`` + - ``CUDA::nppial_static`` + +- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h` + + - ``CUDA::nppicc`` + - ``CUDA::nppicc_static`` + +- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h` + Removed starting in CUDA 11.0, use :ref:`nvJPEG` instead. + + - ``CUDA::nppicom`` + - ``CUDA::nppicom_static`` + +- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h` + + - ``CUDA::nppidei`` + - ``CUDA::nppidei_static`` + +- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h` + + - ``CUDA::nppif`` + - ``CUDA::nppif_static`` + +- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h` + + - ``CUDA::nppig`` + - ``CUDA::nppig_static`` + +- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h` + + - ``CUDA::nppim`` + - ``CUDA::nppim_static`` + +- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h` + + - ``CUDA::nppist`` + - ``CUDA::nppist_static`` + +- `nppisu`: Memory support functions in `nppi_support_functions.h` + + - ``CUDA::nppisu`` + - ``CUDA::nppisu_static`` + +- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h` + + - ``CUDA::nppitc`` + - ``CUDA::nppitc_static`` + +- `npps`: + + - ``CUDA::npps`` + - ``CUDA::npps_static`` + +.. _`cuda_toolkit_nvBLAS`: + +nvBLAS +"""""" + +The `nvBLAS `_ libraries. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvblas`` + +.. _`cuda_toolkit_nvGRAPH`: + +nvGRAPH +""""""" + +The `nvGRAPH `_ library. +Removed starting in CUDA 11.0 + +Targets Created: + +- ``CUDA::nvgraph`` +- ``CUDA::nvgraph_static`` + + +.. _`cuda_toolkit_nvJPEG`: + +nvJPEG +"""""" + +The `nvJPEG `_ library. +Introduced in CUDA 10. + +Targets Created: + +- ``CUDA::nvjpeg`` +- ``CUDA::nvjpeg_static`` + +.. _`cuda_toolkit_nvRTC`: + +nvRTC +""""" + +The `nvRTC `_ (Runtime Compilation) library. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvrtc`` + +.. _`cuda_toolkit_nvml`: + +nvidia-ML +""""""""" + +The `NVIDIA Management Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvml`` + +.. _`cuda_toolkit_nvToolsExt`: + +nvToolsExt +"""""""""" + +The `NVIDIA Tools Extension `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvToolsExt`` + +.. _`cuda_toolkit_opencl`: + +OpenCL +"""""" + +The `NVIDIA OpenCL Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::OpenCL`` + +.. _`cuda_toolkit_cuLIBOS`: + +cuLIBOS +""""""" + +The cuLIBOS library is a backend thread abstraction layer library which is +static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``, +``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP +libraries all automatically have this dependency linked. + +Target Created: + +- ``CUDA::culibos`` + +**Note**: direct usage of this target by consumers should not be necessary. + +.. _`cuda_toolkit_cuRAND`: + + + +Result variables +^^^^^^^^^^^^^^^^ + +``CUDAToolkit_FOUND`` + A boolean specifying whether or not the CUDA Toolkit was found. + +``CUDAToolkit_VERSION`` + The exact version of the CUDA Toolkit found (as reported by + ``nvcc --version`` or ``version.txt``). + +``CUDAToolkit_VERSION_MAJOR`` + The major version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_MINOR`` + The minor version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_PATCH`` + The patch version of the CUDA Toolkit. + +``CUDAToolkit_BIN_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + executable ``nvcc``. + +``CUDAToolkit_INCLUDE_DIRS`` + The path to the CUDA Toolkit ``include`` folder containing the header files + required to compile a project linking against CUDA. + +``CUDAToolkit_LIBRARY_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + Runtime library ``cudart``. + +``CUDAToolkit_LIBRARY_ROOT`` + .. versionadded:: 3.18 + + The path to the CUDA Toolkit directory containing the nvvm directory and + version.txt. + +``CUDAToolkit_TARGET_DIR`` + The path to the CUDA Toolkit directory including the target architecture + when cross-compiling. When not cross-compiling this will be equivalent to + the parent directory of ``CUDAToolkit_BIN_DIR``. + +``CUDAToolkit_NVCC_EXECUTABLE`` + The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may + **not** be the same as + :variable:`CMAKE_CUDA_COMPILER _COMPILER>`. ``nvcc`` must be + found to determine the CUDA Toolkit version as well as determining other + features of the Toolkit. This variable is set for the convenience of + modules that depend on this one. + + +#]=======================================================================] + +# NOTE: much of this was simply extracted from FindCUDA.cmake. + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +# The toolkit is located during compiler detection for CUDA and stored in CMakeCUDACompiler.cmake as +# CMAKE_CUDA_COMPILER_TOOLKIT_ROOT and CMAKE_CUDA_COMPILER_LIBRARY_ROOT. +# We compute the rest based on those here to avoid re-searching and to avoid finding a possibly +# different installation. +if(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT_DIR "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}") + set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_LIBRARY_ROOT}") + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_ROOT_DIR}/bin") + set(CUDAToolkit_NVCC_EXECUTABLE "${CUDAToolkit_BIN_DIR}/nvcc${CMAKE_EXECUTABLE_SUFFIX}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}") + + if(CUDAToolkit_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + endif() +else() + function(_CUDAToolkit_find_root_dir ) + cmake_parse_arguments(arg "" "" "SEARCH_PATHS;FIND_FLAGS" ${ARGN}) + + if(NOT CUDAToolkit_BIN_DIR) + if(NOT CUDAToolkit_SENTINEL_FILE) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${arg_SEARCH_PATHS} + ${arg_FIND_FLAGS} + ) + endif() + + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + find_file(CUDAToolkit_SENTINEL_FILE + NAMES version.txt + PATHS ${arg_SEARCH_PATHS} + NO_DEFAULT_PATH + ) + endif() + + if(EXISTS "${CUDAToolkit_NVCC_EXECUTABLE}") + # If NVCC exists then invoke it to find the toolkit location. + # This allows us to support wrapper scripts (e.g. ccache or colornvcc), CUDA Toolkit, + # NVIDIA HPC SDK, and distro's splayed layouts + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "-v" "__cmake_determine_cuda" + OUTPUT_VARIABLE _CUDA_NVCC_OUT ERROR_VARIABLE _CUDA_NVCC_OUT) + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ TOP=([^\r\n]*)") + get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_MATCH_1}/bin" ABSOLUTE) + else() + get_filename_component(CUDAToolkit_BIN_DIR "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) + endif() + unset(_CUDA_NVCC_OUT) + + mark_as_advanced(CUDAToolkit_BIN_DIR) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE) + endif() + + if(CUDAToolkit_SENTINEL_FILE) + get_filename_component(CUDAToolkit_BIN_DIR ${CUDAToolkit_SENTINEL_FILE} DIRECTORY ABSOLUTE) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}/bin") + + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + endif() + + if(CUDAToolkit_BIN_DIR) + get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + set(CUDAToolkit_ROOT_DIR "${CUDAToolkit_ROOT_DIR}" PARENT_SCOPE) + endif() + + endfunction() + + function(_CUDAToolkit_find_version_file result_variable) + # We first check for a non-scattered installation to prefer it over a scattered installation. + if(CUDAToolkit_ROOT AND EXISTS "${CUDAToolkit_ROOT}/version.txt") + set(${result_variable} "${CUDAToolkit_ROOT}/version.txt" PARENT_SCOPE) + elseif(CUDAToolkit_ROOT_DIR AND EXISTS "${CUDAToolkit_ROOT_DIR}/version.txt") + set(${result_variable} "${CUDAToolkit_ROOT_DIR}/version.txt" PARENT_SCOPE) + elseif(CMAKE_SYSROOT_LINK AND EXISTS "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/version.txt") + set(${result_variable} "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/version.txt" PARENT_SCOPE) + elseif(EXISTS "${CMAKE_SYSROOT}/usr/lib/cuda/version.txt") + set(${result_variable} "${CMAKE_SYSROOT}/usr/lib/cuda/version.txt" PARENT_SCOPE) + endif() + endfunction() + + # For NVCC we can easily deduce the SDK binary directory from the compiler path. + if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") + get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_CUDA_COMPILER}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "") + # Try language provided path first. + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_BIN_DIR}" FIND_FLAGS NO_DEFAULT_PATH) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + + # Try user provided path + if(NOT CUDAToolkit_ROOT_DIR AND CUDAToolkit_ROOT) + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_ROOT}" FIND_FLAGS PATH_SUFFIXES bin NO_DEFAULT_PATH) + endif() + if(NOT CUDAToolkit_ROOT_DIR) + _CUDAToolkit_find_root_dir(FIND_FLAGS PATHS ENV CUDA_PATH PATH_SUFFIXES bin) + endif() + + # If the user specified CUDAToolkit_ROOT but the toolkit could not be found, this is an error. + if(NOT CUDAToolkit_ROOT_DIR AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) + # Declare error messages now, print later depending on find_package args. + set(fail_base "Could not find nvcc executable in path specified by") + set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") + + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) + message(FATAL_ERROR ${cuda_root_fail}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) + message(FATAL_ERROR ${env_cuda_root_fail}) + endif() + else() + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) + message(STATUS ${cuda_root_fail}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) + message(STATUS ${env_cuda_root_fail}) + endif() + endif() + set(CUDAToolkit_FOUND FALSE) + unset(fail_base) + unset(cuda_root_fail) + unset(env_cuda_root_fail) + return() + endif() + endif() + + # CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults. + # + # - Linux: /usr/local/cuda-X.Y + # - macOS: /Developer/NVIDIA/CUDA-X.Y + # - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y + # + # We will also search the default symlink location /usr/local/cuda first since + # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked + # directory is the desired location. + if(NOT CUDAToolkit_ROOT_DIR) + if(UNIX) + if(NOT APPLE) + set(platform_base "/usr/local/cuda-") + else() + set(platform_base "/Developer/NVIDIA/CUDA-") + endif() + else() + set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v") + endif() + + # Build out a descending list of possible cuda installations, e.g. + file(GLOB possible_paths "${platform_base}*") + # Iterate the glob results and create a descending list. + set(versions) + foreach(p ${possible_paths}) + # Extract version number from end of string + string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) + if(IS_DIRECTORY ${p} AND p_version) + list(APPEND versions ${p_version}) + endif() + endforeach() + + # Sort numerically in descending order, so we try the newest versions first. + list(SORT versions COMPARE NATURAL ORDER DESCENDING) + + # With a descending list of versions, populate possible paths to search. + set(search_paths) + foreach(v ${versions}) + list(APPEND search_paths "${platform_base}${v}") + endforeach() + + # Force the global default /usr/local/cuda to the front on Unix. + if(UNIX) + list(INSERT search_paths 0 "/usr/local/cuda") + endif() + + # Now search for the toolkit again using the platform default search paths. + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${search_paths}" FIND_FLAGS PATH_SUFFIXES bin) + + # We are done with these variables now, cleanup for caller. + unset(platform_base) + unset(possible_paths) + unset(versions) + unset(search_paths) + + if(NOT CUDAToolkit_ROOT_DIR) + if(CUDAToolkit_FIND_REQUIRED) + message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") + elseif(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") + endif() + + set(CUDAToolkit_FOUND FALSE) + return() + endif() + endif() + + _CUDAToolkit_find_version_file( _CUDAToolkit_version_file ) + if(_CUDAToolkit_version_file) + # CUDAToolkit_LIBRARY_ROOT contains the device library and version file. + get_filename_component(CUDAToolkit_LIBRARY_ROOT "${_CUDAToolkit_version_file}" DIRECTORY ABSOLUTE) + endif() + unset(_CUDAToolkit_version_file) + + if(CUDAToolkit_NVCC_EXECUTABLE AND + CMAKE_CUDA_COMPILER_VERSION AND + CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) + # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value + # This if statement will always match, but is used to provide variables for MATCH 1,2,3... + if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}") + endif() + elseif(CUDAToolkit_NVCC_EXECUTABLE) + # Compute the version by invoking nvcc + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) + else() + _CUDAToolkit_find_version_file(version_file) + if(version_file) + file(READ "${version_file}" VERSION_INFO) + if(VERSION_INFO MATCHES [=[CUDA Version ([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + endif() + endif() +endif() + +# Find target directory when crosscompiling. +if(CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") + elseif (CMAKE_SYSTEM_NAME STREQUAL "QNX") + set(CUDAToolkit_TARGET_NAME "aarch64-qnx") + else() + set(CUDAToolkit_TARGET_NAME "aarch64-linux") + endif(ANDROID_ARCH_NAME STREQUAL "arm64") + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") + endif() + + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATh + set(_CUDAToolkit_Pop_ROOT_PATH True) + endif() +endif() + +# If not already set we can simply use the toolkit root or it's a scattered installation. +if(NOT CUDAToolkit_TARGET_DIR) + # Not cross compiling + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") + # Now that we have the real ROOT_DIR, find components inside it. + list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + + # Mark that we need to pop the prefix path changes after we have + # found the cudart library. + set(_CUDAToolkit_Pop_Prefix True) +endif() + +# CUDAToolkit_TARGET_DIR always points to the directory containing the include directory. +# On a scattered installation /usr, on a non-scattered something like /usr/local/cuda or /usr/local/cuda-10.2/targets/aarch64-linux. +if(EXISTS "${CUDAToolkit_TARGET_DIR}/include/cuda_runtime.h") + set(CUDAToolkit_INCLUDE_DIR "${CUDAToolkit_TARGET_DIR}/include") +elseif(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cuda_runtime.h in \"${CUDAToolkit_TARGET_DIR}/include\" for CUDAToolkit_INCLUDE_DIR.") +endif() + +# The NVHPC layout moves math library headers and libraries to a sibling directory. +# Create a separate variable so this directory can be selectively added to math targets. +if(NOT EXISTS "${CUDAToolkit_INCLUDE_DIR}/cublas_v2.h") + set(CUDAToolkit_MATH_INCLUDE_DIR "${CUDAToolkit_TARGET_DIR}/../../math_libs/include") + cmake_path(NORMAL_PATH CUDAToolkit_MATH_INCLUDE_DIR) + if(NOT EXISTS "${CUDAToolkit_MATH_INCLUDE_DIR}/cublas_v2.h") + if(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cublas_v2.h in either \"${CUDAToolkit_INCLUDE_DIR}\" or \"${CUDAToolkit_MATH_INCLUDE_DIR}\"") + endif() + unset(CUDAToolkit_MATH_INCLUDE_DIR) + endif() +endif() + +# Find the CUDA Runtime Library libcudart +find_library(CUDA_CUDART + NAMES cudart + PATH_SUFFIXES lib64 lib/x64 +) +find_library(CUDA_CUDART + NAMES cudart + PATH_SUFFIXES lib64/stubs lib/x64/stubs +) + +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cudart library.") +endif() + +if(_CUDAToolkit_Pop_Prefix) + list(REMOVE_AT CMAKE_PREFIX_PATH -1) + unset(_CUDAToolkit_Pop_Prefix) +endif() + +#----------------------------------------------------------------------------- +# Perform version comparison and validate all required variables are set. +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CUDAToolkit + REQUIRED_VARS + CUDAToolkit_INCLUDE_DIR + CUDA_CUDART + CUDAToolkit_BIN_DIR + VERSION_VAR + CUDAToolkit_VERSION +) + +unset(CUDAToolkit_ROOT_DIR) +mark_as_advanced(CUDA_CUDART + CUDAToolkit_INCLUDE_DIR + CUDAToolkit_NVCC_EXECUTABLE + CUDAToolkit_SENTINEL_FILE + ) + +#----------------------------------------------------------------------------- +# Construct result variables +if(CUDAToolkit_FOUND) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) +endif() + +#----------------------------------------------------------------------------- +# Construct import targets +if(CUDAToolkit_FOUND) + + function(_CUDAToolkit_find_and_add_import_lib lib_name) + cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES;EXTRA_INCLUDE_DIRS" ${ARGN}) + + set(search_names ${lib_name} ${arg_ALT}) + + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_DIR} + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + ${arg_EXTRA_PATH_SUFFIXES} + ) + # Don't try any stub directories until we have exhausted all other + # search locations. + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_DIR} + ENV CUDA_PATH + PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs + # Support NVHPC splayed math library layout + ../../math_libs/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/lib64 + ../../math_libs/lib64 + ) + + mark_as_advanced(CUDA_${lib_name}_LIBRARY) + + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + add_library(CUDA::${lib_name} UNKNOWN IMPORTED) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + if(DEFINED CUDAToolkit_MATH_INCLUDE_DIR) + string(FIND ${CUDA_${lib_name}_LIBRARY} "math_libs" math_libs) + if(NOT ${math_libs} EQUAL -1) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_MATH_INCLUDE_DIR}") + endif() + endif() + set_property(TARGET CUDA::${lib_name} PROPERTY IMPORTED_LOCATION "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() + if(arg_EXTRA_INCLUDE_DIRS) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${arg_EXTRA_INCLUDE_DIRS}") + endif() + endif() + endfunction() + + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + endif() + + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + + _CUDAToolkit_find_and_add_import_lib(cudart) + _CUDAToolkit_find_and_add_import_lib(cudart_static) + + # setup dependencies that are required for cudart_static when building + # on linux. These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps + AND TARGET CUDA::cudart_static) + + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) + + if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() + + if(UNIX AND NOT APPLE AND NOT (CMAKE_SYSTEM_NAME STREQUAL "QNX")) + # On Linux, you must link against librt when using the static cuda runtime. + find_library(CUDAToolkit_rt_LIBRARY rt) + mark_as_advanced(CUDAToolkit_rt_LIBRARY) + if(NOT CUDAToolkit_rt_LIBRARY) + message(WARNING "Could not find librt library, needed by CUDA::cudart_static") + else() + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library + foreach (cuda_lib cublasLt cublas cufft curand cusparse nppc nvjpeg) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + endforeach() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.0.0) + # cublas depends on cublasLt + # https://docs.nvidia.com/cuda/archive/11.0/cublas/index.html#static-library + _CUDAToolkit_find_and_add_import_lib(cublas DEPS cublasLt) + _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS cublasLt_static) + endif() + + # cuFFTW depends on cuFFT + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) + _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 9.2) + _CUDAToolkit_find_and_add_import_lib(cufft_static_nocallback DEPS culibos) + endif() + + # cuSOLVER depends on cuBLAS, and cuSPARSE + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) + + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.1.2) + # cusolver depends on liblapack_static.a starting with CUDA 10.1 update 2, + # https://docs.nvidia.com/cuda/archive/11.5.0/cusolver/index.html#static-link-lapack + _CUDAToolkit_find_and_add_import_lib(cusolver_lapack_static ALT lapack_static) # implementation detail static lib + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cusolver_lapack_static) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER 11.2.1) + # cusolver depends on libcusolver_metis and cublasLt + # https://docs.nvidia.com/cuda/archive/11.2.2/cusolver/index.html#link-dependency + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublasLt) + + _CUDAToolkit_find_and_add_import_lib(cusolver_metis_static ALT metis_static) # implementation detail static lib + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cusolver_metis_static cublasLt_static) + endif() + + # nvGRAPH depends on cuRAND, and cuSOLVER. + _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + + # Process the majority of the NPP libraries. + foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + endforeach() + + find_path(CUDAToolkit_CUPTI_INCLUDE_DIR cupti.h PATHS + "${CUDAToolkit_ROOT_DIR}/extras/CUPTI/include" + "${CUDAToolkit_INCLUDE_DIR}/../extras/CUPTI/include" + "${CUDATookit_INCLUDE_DIR}" + NO_DEFAULT_PATH) + + if(CUDAToolkit_CUPTI_INCLUDE_DIR) + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/ + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/ + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + endif() + + _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + + if(WIN32) + # nvtools can be installed outside the CUDA toolkit directory + # so prefer the NVTOOLSEXT_PATH windows only environment variable + # In addition on windows the most common name is nvToolsExt64_1 + find_library(CUDA_nvToolsExt_LIBRARY + NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt + PATHS ENV NVTOOLSEXT_PATH + ENV CUDA_PATH + PATH_SUFFIXES lib/x64 lib + ) + endif() + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + + _CUDAToolkit_find_and_add_import_lib(OpenCL) +endif() + +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) +endif() diff --git a/cmake/ectrans_compile_options.cmake b/cmake/ectrans_compile_options.cmake index 351e54a8..6815d6da 100644 --- a/cmake/ectrans_compile_options.cmake +++ b/cmake/ectrans_compile_options.cmake @@ -14,6 +14,17 @@ elseif( CMAKE_Fortran_COMPILER_ID MATCHES "GNU" ) if( NOT CMAKE_Fortran_COMPILER_VERSION VERSION_LESS 10 ) ecbuild_add_fortran_flags("-fallow-argument-mismatch") endif() +elseif( CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC" ) + ecbuild_add_fortran_flags("-Mlarge_arrays") + + # should really be part of configuration, or ecbuild default? + ecbuild_add_fortran_flags("-traceback" BUILD DEBUG ) + ecbuild_add_fortran_flags("-fast" BUILD RELEASE ) + ecbuild_add_fortran_flags("-gopt -fast" BUILD RELWITHDEBINFO ) +elseif( CMAKE_Fortran_COMPILER_ID MATCHES "Cray" ) + ecbuild_add_fortran_flags("-hnomessage=878") # A module named ... has already been directly or indirectly use associated into this scope + ecbuild_add_fortran_flags("-hnomessage=867") # Module ... has no public objects declared in the module, therefore nothing can be use associated from the module. + ecbuild_add_fortran_flags("-M7256") # An OpenMP parallel construct in a target region is limited to a single thread. elseif( CMAKE_Fortran_COMPILER_ID MATCHES "Intel" ) ecbuild_add_fortran_flags("-march=core-avx2 -no-fma" BUILD BIT) ecbuild_add_fortran_flags("-fast-transcendentals -fp-model precise -fp-speculation=safe") diff --git a/cmake/ectrans_find_cuda.cmake b/cmake/ectrans_find_cuda.cmake new file mode 100644 index 00000000..7eaeddf3 --- /dev/null +++ b/cmake/ectrans_find_cuda.cmake @@ -0,0 +1,25 @@ +macro( ectrans_find_cuda ) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + ecbuild_info("CMAKE_CUDA_ARCHITECTURES not defined, using 80") + set(CMAKE_CUDA_ARCHITECTURES 80) + endif() + check_language(CUDA) + if ( NOT CMAKE_CUDA_COMPILER ) + set( HAVE_CUDA OFF ) + else() + enable_language(CUDA) + set( HAVE_CUDA ON ) + find_package( CUDAToolkit ) + if( NOT TARGET CUDA::cublas ) + ecbuild_info("No target CUDA::cublas") + set( HAVE_CUDA OFF ) + endif() + if( NOT TARGET CUDA::cufft ) + ecbuild_info("No target CUDA::cufft") + set( HAVE_CUDA OFF ) + endif() + ecbuild_info( "cuda arch : [${CMAKE_CUDA_ARCHITECTURES}]" ) + ecbuild_info( "cublas : [${CUDA_cublas_LIBRARY}]" ) + ecbuild_info( "cufft : [${CUDA_cufft_LIBRARY}]" ) + endif() +endmacro() diff --git a/cmake/ectrans_find_hip.cmake b/cmake/ectrans_find_hip.cmake new file mode 100644 index 00000000..90fca116 --- /dev/null +++ b/cmake/ectrans_find_hip.cmake @@ -0,0 +1,150 @@ +# (C) Copyright 2020- ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +macro( ectrans_find_hip ) + # This macro finds all HIP related libraries, if found, HAVE_HIP=1 + + cmake_minimum_required( VERSION 3.24 FATAL_ERROR ) + + set( options "" ) + set( single_value_args REQUIRED ) + set( multi_value_args "" ) + cmake_parse_arguments( _PAR "${options}" "${single_value_args}" "${multi_value_args}" ${_FIRST_ARG} ${ARGN} ) + + set(HIP_REQUIRED "") + if( _PAR_REQUIRED ) + set(HIP_REQUIRED "REQUIRED" ) + endif() + + set(HAVE_HIP 1) + + # Setup ROCM_PATH + if (NOT DEFINED ROCM_PATH ) + find_path(ROCM_PATH + hip + ENV{ROCM_DIR} + ENV{ROCM_PATH} + ENV{HIP_PATH} + ${HIP_PATH}/.. + ${HIP_ROOT_DIR}/../ + ${ROCM_ROOT_DIR} + /opt/rocm) + endif() + ecbuild_info("ROCM path: ${ROCM_PATH}") + # Update CMAKE_PREFIX_PATH to make sure all the configs that hip depends on are found. + set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${ROCM_PATH}") + + set(HAVE_HIP 1) + + include(CheckLanguage) + check_language(HIP) + ecbuild_add_option( FEATURE HIP_LANGUAGE DEFAULT ON CONDITION CMAKE_HIP_COMPILER ) + + if( HAVE_HIP_LANGUAGE ) + if(NOT CMAKE_HIP_COMPILER) + if( _PAR_REQUIRED ) + ecbuild_error("HIP compiler not found") + else() + ecbuild_info("HIP compiler not found: HAVE_HIP=0") + set(HAVE_HIP 0) + endif() + else() + enable_language(HIP) + ecbuild_info("HIP compiler found: ${CMAKE_HIP_COMPILER}") + ecbuild_info("HIP target architecture: ${CMAKE_HIP_ARCHITECTURES}") + endif() + + # Find HIP libraries + find_package(hip REQUIRED CONFIG) + if( NOT hip_FOUND ) + ecbuild_info("hip libraries not found: HAVE_HIP=0") + set( HAVE_HIP 0 ) + endif() + + ecbuild_info("HIP version: ${hip_VERSION}") + + else() + ecbuild_info("HIP sources will be compiled with C++ compiler with added flags") + ecbuild_info("HIP target architecture: ${CMAKE_HIP_ARCHITECTURES}") + enable_language(CXX) + set(CMAKE_MODULE_PATH $ENV{HIP_ROOT}/cmake ${CMAKE_MODULE_PATH}) + find_package(HIP) + if ( NOT HIP_FOUND ) + ecbuild_info("HIP not found: HAVE_HIP=0") + set( HAVE_HIP 0) + endif() + ecbuild_info("HIP version: ${HIP_VERSION}") + + endif() + + if( HAVE_HIP ) + find_package(hipblas CONFIG ${HIP_REQUIRED}) + if( NOT hipblas_FOUND ) + ecbuild_info("hipblas libraries not found: HAVE_HIP=0") + set( HAVE_HIP 0 ) + endif() + + find_package(hipfft CONFIG ${HIP_REQUIRED}) + if( NOT hipfft_FOUND ) + ecbuild_info("hipfft libraries not found: HAVE_HIP=0") + set( HAVE_HIP 0 ) + endif() + + find_package(rocblas CONFIG ${HIP_REQUIRED}) + if( NOT rocblas_FOUND ) + ecbuild_info("rocblas libraries not found: HAVE_HIP=0") + set( HAVE_HIP 0 ) + endif() + + find_package(rocfft CONFIG ${HIP_REQUIRED}) + if( NOT rocfft_FOUND ) + ecbuild_info("rocfft libraries not found: HAVE_HIP=0") + set( HAVE_HIP 0 ) + endif() + + if( HAVE_HIP ) + list( APPEND ECTRANS_GPU_HIP_LIBRARIES ${hipblas_LIBRARIES} ${hipfft_LIBRARIES}) + list( APPEND ECTRANS_GPU_HIP_LIBRARIES ${rocblas_LIBRARIES} ${rocfft_LIBRARIES}) + endif() + + endif() + ecbuild_info("HIP libraries: ${ECTRANS_GPU_HIP_LIBRARIES}") +endmacro() + +macro( ectrans_declare_hip_sources ) + set( options QUIET ) + set( single_value_args "" ) + set( multi_value_args SOURCES SOURCES_GLOB ) + cmake_parse_arguments( _PAR "${options}" "${single_value_args}" "${multi_value_args}" ${_FIRST_ARG} ${ARGN} ) + set( source_files ${_PAR_SOURCES} ) + if( _PAR_SOURCES_GLOB ) + ecbuild_list_add_pattern( LIST source_files + GLOB ${_PAR_SOURCES_GLOB} + QUIET ) + endif() + + if( HAVE_HIP_LANGUAGE ) + if(NOT _PAR_QUIET) + ecbuild_info("Applying HIP language to ${source_files}") + endif() + set_source_files_properties( ${source_files} PROPERTIES LANGUAGE HIP ) + else() + if(NOT _PAR_QUIET) + ecbuild_info("Applying HIP flags to ${source_files}") + endif() + set_source_files_properties( ${source_files} PROPERTIES LANGUAGE CXX ) + set( _flags "-x hip" ) + if( CMAKE_HIP_FLAGS ) + set( _flags "${_flags} ${CMAKE_HIP_FLAGS}" ) + endif() + if( CMAKE_HIP_ARCHITECTURES ) + set( _flags "${_flags} --offload-arch=${CMAKE_HIP_ARCHITECTURES}" ) + endif() + set_source_files_properties( ${source_files} PROPERTIES COMPILE_FLAGS "${_flags}" ) + endif() +endmacro() diff --git a/cmake/ectrans_macros.cmake b/cmake/ectrans_macros.cmake index c8b81cce..56a9343a 100644 --- a/cmake/ectrans_macros.cmake +++ b/cmake/ectrans_macros.cmake @@ -6,6 +6,11 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. +if( CMAKE_VERSION VERSION_LESS 3.22.0 ) + # FindCUDAToolkit from cmake < 3.22 does not support recent NVHPC directory structures + set( CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/FindCUDAToolkit-cmake-3.24 ${CMAKE_MODULE_PATH} ) +endif() + ### Workaround to extract GIT_SHA1 from parent directory if( NOT ${PROJECT_NAME}_GIT_SHA1 ) get_filename_component( PARENT_DIR ${PROJECT_SOURCE_DIR} DIRECTORY ) @@ -20,4 +25,7 @@ endif() include( ectrans_target_fortran_module_directory ) include( ectrans_find_lapack ) +include( ectrans_find_cuda ) +include( ectrans_find_hip ) +include( CheckLanguage ) diff --git a/cmake/project_summary.cmake b/cmake/project_summary.cmake index 346025bb..be8f8b78 100644 --- a/cmake/project_summary.cmake +++ b/cmake/project_summary.cmake @@ -10,8 +10,9 @@ ecbuild_info( "build type : [${CMAKE_BUILD_TYPE}]" ) set( Fortran_flags_str "Fortran flags" ) set( C_flags_str "C flags " ) set( CXX_flags_str "C++ flags " ) +set( HIP_flags_str "HIP flags " ) string( TOUPPER ${PROJECT_NAME} PNAME ) - foreach( lang Fortran C CXX ) + foreach( lang Fortran C CXX HIP ) set( flags "${CMAKE_${lang}_FLAGS} ${CMAKE_${lang}_FLAGS_${CMAKE_BUILD_TYPE_CAPS}} ${${PNAME}_${lang}_FLAGS} ${${PNAME}_${lang}_FLAGS_${CMAKE_BUILD_TYPE_CAPS}}" ) string(REGEX REPLACE "[ ]+" " " flags ${flags}) string(STRIP "${flags}" flags) @@ -21,6 +22,10 @@ ecbuild_info( "OMP" ) foreach( lang Fortran ) ecbuild_info( " OpenMP_${lang}_FLAGS : [${OpenMP_${lang}_FLAGS}]" ) endforeach() +ecbuild_info( "ACC" ) + foreach( lang Fortran ) +ecbuild_info( " OpenACC_${lang}_FLAGS : [${OpenACC_${lang}_FLAGS}]" ) + endforeach() ecbuild_info( "BLAS/LAPACK" ) if( HAVE_SINGLE_PRECISION AND HAVE_DOUBLE_PRECISION AND ECTRANS_CRAYHACK_DOUBLE_PRECISION_WITHOUT_MKL ) ecbuild_info( " trans_dp : [${LAPACK_dp}]" ) diff --git a/src/programs/CMakeLists.txt b/src/programs/CMakeLists.txt index fb551a48..f0db3334 100644 --- a/src/programs/CMakeLists.txt +++ b/src/programs/CMakeLists.txt @@ -6,42 +6,42 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. -if( HAVE_TOOLS ) +#foreach( program ectrans-benchmark ectrans-benchmark-ifs ) +foreach( program ectrans-benchmark ) - if( HAVE_DOUBLE_PRECISION ) - set( trans trans_dp ) - else() - set( trans trans_sp ) + if ( HAVE_CPU ) + foreach( prec dp sp ) + if( HAVE_${prec} ) + ecbuild_add_executable( TARGET ${program}-cpu-${prec} + SOURCES ${program}.F90 + LINKER_LANGUAGE Fortran + LIBS + fiat + parkind_${prec} + trans_${prec} + ) + target_compile_definitions(${program}-cpu-${prec} PRIVATE VERSION="cpu") + endif() + endforeach( prec) endif() - ecbuild_warn_var( ECTRANS_TOOLS_RTABLE_PATH ) - - file( GLOB ectrans_programs *.F90 ) - foreach( _program IN ITEMS ${ectrans_programs} ) - get_filename_component( _program ${_program} NAME_WE ) - - ecbuild_add_executable(TARGET ${_program} - SOURCES ${_program}.F90 - LIBS ${trans} - LINKER_LANGUAGE Fortran - DEFINITIONS ECTRANS_TOOLS_RTABLE_PATH="${ECTRANS_TOOLS_RTABLE_PATH}" ) - - endforeach() - -endif() - -foreach( prec sp dp ) - if( HAVE_${prec} ) - ecbuild_add_executable(TARGET ectrans-benchmark-${prec} - SOURCES ectrans-benchmark.F90 - LINKER_LANGUAGE Fortran - LIBS - fiat - parkind_${prec} - trans_${prec} - ) - endif() -endforeach() + if( HAVE_GPU ) + foreach( prec dp sp ) + if( HAVE_${prec} ) + ecbuild_add_executable( TARGET ${program}-gpu-${prec} + SOURCES ${program}.F90 + LINKER_LANGUAGE Fortran + LIBS + fiat + parkind_${prec} + trans_gpu_${prec} + OpenACC::OpenACC_Fortran + ) + target_compile_definitions(${program}-gpu-${prec} PRIVATE VERSION="gpu") + endif() + endforeach( prec ) + endif( HAVE_GPU ) +endforeach( program ) # ectrans information tool @@ -65,4 +65,3 @@ install( FILES PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) - diff --git a/src/programs/ectrans-benchmark-ifs.F90 b/src/programs/ectrans-benchmark-ifs.F90 new file mode 100644 index 00000000..fd9fcc8d --- /dev/null +++ b/src/programs/ectrans-benchmark-ifs.F90 @@ -0,0 +1,1506 @@ +! (C) Copyright 2014- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +program transform_test + +! +! Spectral transform test +! +! This test performs spectral to real and real to spectral transforms repeated in +! timed loop. +! +! 1) One "surface" field is always transformed: +! zspsc2(1,1:nspec2) <-> zgmvs(1:nproma,1:1,1:ngbplk) +! +! 2) A Multiple "3d" fields are transformed and can be disabled with "--nfld 0" +! +! zspsc3a(1:nlev,1:nspec2,1:nfld) <-> zgp3a(1:nproma,1:nlev,1:nfld,1:ngpblk) +! +! 3) Optionally a "3d" vorticity/divergence field is transformed to uv (wind) and +! can be enabled with "--vordiv" +! +! zspvor(1:nlev,1:nspec2) / zspdiv(1:nlev,1:nspec2) <-> zgpuv(1:nproma,1:nlev,1:2,1:ngpblk) +! +! 4) Optionally scalar derivatives can be computed for the fields described in 1) and 2) +! This must be enabled with "--scders" +! +! 5) Optionally uv East-West derivate can be computed from vorticity/divergence. +! This must be enabled with "--vordiv --uvders" +! +! +! Authors : George Mozdzynski +! Willem Deconinck +! Ioan Hadade +! Sam Hatfield +! + +use parkind1, only: jpim, jprb, jprd +use oml_mod ,only : oml_max_threads +use mpl_module +use yomgstats, only: jpmaxstat +use yomhook, only : dr_hook_init + +implicit none + +! Number of points in top/bottom latitudes +integer(kind=jpim), parameter :: min_octa_points = 20 + +integer(kind=jpim) :: istack, getstackusage +real(kind=jprb), dimension(1) :: zmaxerr(5), zerr(5) +real(kind=jprb) :: zmaxerrg + +! Output unit numbers +integer(kind=jpim), parameter :: nerr = 0 ! Unit number for STDERR +integer(kind=jpim), parameter :: nout = 6 ! Unit number for STDOUT +integer(kind=jpim), parameter :: noutdump = 7 ! Unit number for field output + +! Default parameters +integer(kind=jpim) :: nsmax = 79 ! Spectral truncation +integer(kind=jpim) :: iters = 10 ! Number of iterations for transform test +integer(kind=jpim) :: nfld = 1 ! Number of scalar fields +integer(kind=jpim) :: nlev = 1 ! Number of vertical levels + +integer(kind=jpim) :: nflevg +integer(kind=jpim) :: ndgl ! Number of latitudes +integer(kind=jpim) :: nspec2 +integer(kind=jpim) :: ngptot +integer(kind=jpim) :: ngptotg +integer(kind=jpim) :: ifld +integer(kind=jpim) :: jroc +integer(kind=jpim) :: jb +integer(kind=jpim) :: nspec2g +integer(kind=jpim) :: i +integer(kind=jpim) :: ja +integer(kind=jpim) :: ib +integer(kind=jpim) :: jprtrv + +integer(kind=jpim), allocatable :: nloen(:), nprcids(:) +integer(kind=jpim) :: myproc, jj, jf, ilf +integer :: jstep + +real(kind=jprd) :: ztinit, ztloop, timef, ztstepmax, ztstepmin, ztstepavg, ztstepmed +real(kind=jprd) :: ztstepmax1, ztstepmin1, ztstepavg1, ztstepmed1 +real(kind=jprd) :: ztstepmax2, ztstepmin2, ztstepavg2, ztstepmed2 +real(kind=jprd), allocatable :: ztstep(:), ztstep1(:), ztstep2(:) + +real(kind=jprb), allocatable :: znormsp(:), znormsp1(:), znormdiv(:), znormdiv1(:) +real(kind=jprb), allocatable :: znormvor(:), znormvor1(:), znormt(:), znormt1(:) +real(kind=jprd) :: zaveave(0:jpmaxstat) + +! Grid-point space data structures +real(kind=jprb), allocatable, target :: zgmv (:,:,:,:) ! Multilevel fields at t and t-dt +real(kind=jprb), allocatable, target :: zgmvs (:,:,:) ! Single level fields at t and t-dt +real(kind=jprb), pointer :: zgp3a (:,:,:,:) ! Multilevel fields at t and t-dt +real(kind=jprb), pointer :: zgpuv (:,:,:,:) ! Multilevel fields at t and t-dt +real(kind=jprb), pointer :: zgp2 (:,:,:) ! Single level fields at t and t-dt + +! Spectral space data structures +real(kind=jprb), allocatable, target :: sp3d(:,:,:) +real(kind=jprb), pointer :: zspvor(:,:) => null() +real(kind=jprb), pointer :: zspdiv(:,:) => null() +real(kind=jprb), pointer :: zspsc3a(:,:,:) => null() +real(kind=jprb), allocatable :: zspsc2(:,:) +real(kind=jprb), allocatable :: zave(:),zmin(:),zmax(:),zreel(:,:,:) + +logical :: lstack = .false. ! Output stack info +logical :: luserpnm = .false. +logical :: lkeeprpnm = .false. +logical :: luseflt = .false. ! Use fast legendre transforms +logical :: ltrace_stats = .false. +logical :: lstats_omp = .false. +logical :: lstats_comms = .false. +logical :: lstats_mpl = .false. +logical :: lstats = .true. ! gstats statistics +logical :: lbarrier_stats = .false. +logical :: lbarrier_stats2 = .false. +logical :: ldetailed_stats = .false. +logical :: lstats_alloc = .false. +logical :: lsyncstats = .false. +logical :: lstatscpu = .false. +logical :: lstats_mem = .false. +logical :: lxml_stats = .false. +logical :: lfftw = .true. ! Use FFTW for Fourier transforms +logical :: lvordiv = .false. +logical :: lscders = .false. +logical :: luvders = .false. +logical :: lprint_norms = .false. ! Calculate and print spectral norms +logical :: lmeminfo = .false. ! Show information from FIAT routine ec_meminfo at the end +logical :: lgpnorms = .false. ! print gpnorms + +integer(kind=jpim) :: nstats_mem = 0 +integer(kind=jpim) :: ntrace_stats = 0 +integer(kind=jpim) :: nprnt_stats = 1 + +! The multiplier of the machine epsilon used as a tolerance for correctness checking +! ncheck = 0 (the default) means that correctness checking is disabled +integer(kind=jpim) :: ncheck = 0 + +logical :: lmpoff = .false. ! Message passing switch + +! Verbosity level (0 or 1) +integer :: verbosity = 0 + +real(kind=jprb) :: zra = 6371229._jprb + +integer(kind=jpim) :: nmax_resol = 37 ! Max number of resolutions +integer(kind=jpim) :: npromatr = 0 ! nproma for trans lib +integer(kind=jpim) :: ncombflen = 1800000 ! Size of comm buffer + +integer(kind=jpim) :: nproc ! Number of procs +integer(kind=jpim) :: nthread +integer(kind=jpim) :: nprgpns ! Grid-point decomp +integer(kind=jpim) :: nprgpew ! Grid-point decomp +integer(kind=jpim) :: nprtrv = 0 ! Spectral decomp +integer(kind=jpim) :: nprtrw = 0 ! Spectral decomp +integer(kind=jpim) :: nspecresmin = 80 ! Minimum spectral resolution, for controlling nprtrw +integer(kind=jpim) :: mysetv +integer(kind=jpim) :: mysetw +integer(kind=jpim) :: mp_type = 2 ! Message passing type +integer(kind=jpim) :: mbx_size = 150000000 ! Mailbox size + +integer(kind=jpim), allocatable :: numll(:), ivset(:) +integer(kind=jpim) :: ivsetsc(1) + +integer(kind=jpim) :: nflevl + +! sumpini +integer(kind=jpim) :: isqr +logical :: lsync_trans = .true. ! Activate barrier sync +logical :: leq_regions = .true. ! Eq regions flag + + +integer(kind=jpim) :: nproma = 0 +integer(kind=jpim) :: ngpblks +! locals +integer(kind=jpim) :: iprtrv +integer(kind=jpim) :: iprtrw +integer(kind=jpim) :: iprused, ilevpp, irest, ilev, jlev, iprev + +integer(kind=jpim) :: ndimgmv = 0 ! Third dim. of gmv "(nproma,nflevg,ndimgmv,ngpblks)" +integer(kind=jpim) :: ndimgmvs = 0 ! Second dim. gmvs "(nproma,ndimgmvs,ngpblks)" + +integer(kind=jpim) :: jbegin_uv = 0 +integer(kind=jpim) :: jend_uv = 0 +integer(kind=jpim) :: jbegin_sc = 0 +integer(kind=jpim) :: jend_sc = 0 +integer(kind=jpim) :: jbegin_scder_NS = 0 +integer(kind=jpim) :: jend_scder_NS = 0 +integer(kind=jpim) :: jbegin_scder_EW = 0 +integer(kind=jpim) :: jend_scder_EW = 0 +integer(kind=jpim) :: jbegin_uder_EW = 0 +integer(kind=jpim) :: jend_uder_EW = 0 +integer(kind=jpim) :: jbegin_vder_EW = 0 +integer(kind=jpim) :: jend_vder_EW = 0 + +logical :: ldump_values = .false. + +integer, external :: ec_mpirank +logical :: luse_mpi = .true. + +character(len=16) :: cgrid = '' + +!=================================================================================================== + +#include "setup_trans0.h" +#include "setup_trans.h" +#include "inv_trans.h" +#include "dir_trans.h" +#include "trans_inq.h" +#include "specnorm.h" +#include "gpnorm_trans.h" +#include "abor1.intfb.h" +#include "gstats_setup.intfb.h" +#include "ec_meminfo.intfb.h" + +!=================================================================================================== + +luse_mpi = detect_mpirun() + +! Setup +call get_command_line_arguments(nsmax, cgrid, iters, nfld, nlev, lvordiv, lscders, luvders, lgpnorms, & + & luseflt, nproma, verbosity, ldump_values, lprint_norms, lmeminfo, nprtrv, nprtrw, ncheck) +if (cgrid == '') cgrid = cubic_octahedral_gaussian_grid(nsmax) +call parse_grid(cgrid, ndgl, nloen) +nflevg = nlev + +!=================================================================================================== + +if (luse_mpi) then + call mpl_init(ldinfo=(verbosity>=1)) + nproc = mpl_nproc() + myproc = mpl_myrank() +else + nproc = 1 + myproc = 1 + mpl_comm = -1 +endif +nthread = oml_max_threads() + +call dr_hook_init() + +!=================================================================================================== + +if( lstats ) call gstats(0,0) +ztinit = timef() + +! only output to stdout on pe 1 +if (nproc > 1) then + if (myproc /= 1) then + open(unit=nout, file='/dev/null') + endif +endif + +if (ldetailed_stats) then + lstats_omp = .true. + lstats_comms = .true. + lstats_mpl = .true. + lstatscpu = .true. + nprnt_stats = nproc +! lstats_mem = .true. +! lstats_alloc = .true. +endif + +!=================================================================================================== + +allocate(nprcids(nproc)) +do jj = 1, nproc + nprcids(jj) = jj +enddo + +if (nproc <= 1) then + lmpoff = .true. +endif + +! Compute nprgpns and nprgpew +! This version selects most square-like distribution +! These will change if leq_regions=.true. +if (nproc == 0) nproc = 1 +isqr = int(sqrt(real(nproc,jprb))) +do ja = isqr, nproc + ib = nproc/ja + if (ja*ib == nproc) then + nprgpns = max(ja,ib) + nprgpew = min(ja,ib) + exit + endif +enddo + +! From sumpini, although this should be specified in namelist +if (nspecresmin == 0) nspecresmin = nproc + +! Compute nprtrv and nprtrw if not provided on the command line +if (nprtrv > 0 .or. nprtrw > 0) then + if (nprtrv == 0) nprtrv = nproc/nprtrw + if (nprtrw == 0) nprtrw = nproc/nprtrv + if (nprtrw*nprtrv /= nproc) call abor1('transform_test:nprtrw*nprtrv /= nproc') + if (nprtrw > nspecresmin) call abor1('transform_test:nprtrw > nspecresmin') +else + do jprtrv = 4, nproc + nprtrv = jprtrv + nprtrw = nproc/nprtrv + if (nprtrv*nprtrw /= nproc) cycle + if (nprtrv > nprtrw) exit + if (nprtrw > nspecresmin) cycle + if (nprtrw <= nspecresmin/(2*oml_max_threads())) exit + enddo + ! Go for approx square partition for backup + if (nprtrv*nprtrw /= nproc .or. nprtrw > nspecresmin .or. nprtrv > nprtrw) then + isqr = int(sqrt(real(nproc,jprb))) + do ja = isqr, nproc + ib = nproc/ja + if (ja*ib == nproc) then + nprtrw = max(ja, ib) + nprtrv = min(ja, ib) + if (nprtrw > nspecresmin ) then + call abor1('transform_test:nprtrw (approx square value) > nspecresmin') + endif + exit + endif + enddo + endif +endif + +! Create communicators for mpi groups +if (.not.lmpoff) then + call mpl_groups_create(nprtrw, nprtrv) +endif + +if (lmpoff) then + mysetw = (myproc - 1)/nprtrv + 1 + mysetv = mod(myproc - 1, nprtrv) + 1 +else + call mpl_cart_coords(myproc, mysetw, mysetv) + + ! Just checking for now... + iprtrv = mod(myproc - 1, nprtrv) + 1 + iprtrw = (myproc - 1)/nprtrv + 1 + if (iprtrv /= mysetv .or. iprtrw /= mysetw) then + call abor1('transform_test:inconsistency when computing mysetw and mysetv') + endif +endif + +if (.not. lmpoff) then + call mpl_buffer_method(kmp_type=mp_type, kmbx_size=mbx_size, kprocids=nprcids, ldinfo=(verbosity>=1)) +endif + +! Determine number of local levels for fourier and legendre calculations +! based on the values of nflevg and nprtrv +allocate(numll(nprtrv+1)) + +! Calculate remainder +iprused = min(nflevg+1, nprtrv) +ilevpp = nflevg/nprtrv +irest = nflevg -ilevpp*nprtrv +do jroc = 1, nprtrv + if (jroc <= irest) then + numll(jroc) = ilevpp+1 + else + numll(jroc) = ilevpp + endif +enddo +numll(iprused+1:nprtrv+1) = 0 + +nflevl = numll(mysetv) + +ivsetsc(1) = iprused +ifld = 0 + +!=================================================================================================== +! Setup gstats +!=================================================================================================== + +if (lstats) then + call gstats_setup(nproc, myproc, nprcids, & + & lstats, lstatscpu, lsyncstats, ldetailed_stats, lbarrier_stats, lbarrier_stats2, & + & lstats_omp, lstats_comms, lstats_mem, nstats_mem, lstats_alloc, & + & ltrace_stats, ntrace_stats, nprnt_stats, lxml_stats) + call gstats_psut + + ! Assign labels to GSTATS regions + call gstats_labels +endif + +!=================================================================================================== +! Call ecTrans setup routines +!=================================================================================================== + +if (verbosity >= 1) write(nout,'(a)')'======= Setup ecTrans =======' + +call gstats(1, 0) +call setup_trans0(kout=nout, kerr=nerr, kprintlev=merge(2, 0, verbosity == 1), & + & kmax_resol=nmax_resol, kpromatr=npromatr, kprgpns=nprgpns, kprgpew=nprgpew, & + & kprtrw=nprtrw, kcombflen=ncombflen, ldsync_trans=lsync_trans, & + & ldeq_regions=leq_regions, prad=zra, ldalloperm=.true., ldmpoff=.not.luse_mpi) +call gstats(1, 1) + +call gstats(2, 0) +! IFS spectral fields are dimensioned NFLEVL, Nils !! +call set_ectrans_gpu_nflev(nflevl) + ! We pass nflevl via environment variable in order not to change API + ! In long run, ectrans should grow its internal buffers automatically +call setup_trans(ksmax=nsmax, kdgl=ndgl, kloen=nloen, ldsplit=.true., & + & ldusefftw=lfftw, lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & + & lduseflt=luseflt) +call gstats(2, 1) + +call trans_inq(kspec2=nspec2, kspec2g=nspec2g, kgptot=ngptot, kgptotg=ngptotg) + +if (nproma == 0) then ! no blocking (default when not specified) + nproma = ngptot +endif + +! Calculate number of NPROMA blocks +ngpblks = (ngptot - 1)/nproma+1 + +!=================================================================================================== +! Print information before starting +!=================================================================================================== + +! Print configuration details +if (verbosity >= 0) then + write(nout,'(" ")') + write(nout,'(a)')'======= Start of runtime parameters =======' + write(nout,'(" ")') + write(nout,'("nsmax ",i0)') nsmax + write(nout,'("grid ",a)') trim(cgrid) + write(nout,'("ndgl ",i0)') ndgl + write(nout,'("nproc ",i0)') nproc + write(nout,'("nthread ",i0)') nthread + write(nout,'("nprgpns ",i0)') nprgpns + write(nout,'("nprgpew ",i0)') nprgpew + write(nout,'("nprtrw ",i0)') nprtrw + write(nout,'("nprtrv ",i0)') nprtrv + write(nout,'("ngptot ",i0)') ngptot + write(nout,'("ngptotg ",i0)') ngptotg + write(nout,'("nfld ",i0)') nfld + write(nout,'("nlev ",i0)') nlev + write(nout,'("nproma ",i0)') nproma + write(nout,'("ngpblks ",i0)') ngpblks + write(nout,'("nspec2 ",i0)') nspec2 + write(nout,'("nspec2g ",i0)') nspec2g + write(nout,'("luseflt ",l)') luseflt + write(nout,'("lvordiv ",l)') lvordiv + write(nout,'("lscders ",l)') lscders + write(nout,'("luvders ",l)') luvders + write(nout,'("lgpnorms ",l)') lgpnorms + write(nout,'(" ")') + write(nout,'(a)') '======= End of runtime parameters =======' + write(nout,'(" ")') +end if + +!=================================================================================================== +! Allocate and Initialize spectral arrays +!=================================================================================================== + +! Allocate spectral arrays +! Try to mimick IFS layout as much as possible +nullify(zspvor) +nullify(zspdiv) +nullify(zspsc3a) +allocate(sp3d(nflevl,nspec2,2+nfld)) +allocate(zspsc2(1,nspec2)) + +call initialize_spectral_arrays(nsmax, zspsc2, sp3d) + +! Point convenience variables to storage variable sp3d +zspvor => sp3d(:,:,1) +zspdiv => sp3d(:,:,2) +zspsc3a => sp3d(:,:,3:3+(nfld-1)) + +!=================================================================================================== +! Allocate gridpoint arrays +!=================================================================================================== + +allocate(ivset(nflevg)) + +! Compute spectral distribution +ilev = 0 +do jb = 1, nprtrv + do jlev=1, numll(jb) + ilev = ilev + 1 + ivset(ilev) = jb + enddo +enddo + +! Allocate grid-point arrays +if (lvordiv) then + jbegin_uv = 1 + jend_uv = 2 +endif +if (luvders) then + jbegin_uder_EW = jend_uv + 1 + jend_uder_EW = jbegin_uder_EW + 1 + jbegin_vder_EW = jend_uder_EW + 1 + jend_vder_EW = jbegin_vder_EW + 1 +else + jbegin_uder_EW = jend_uv + jend_uder_EW = jend_uv + jbegin_vder_EW = jend_uv + jend_vder_EW = jend_uv +endif + +jbegin_sc = jend_vder_EW + 1 +jend_sc = jend_vder_EW + nfld + +if (lscders) then + ndimgmvs = 3 + jbegin_scder_NS = jend_sc + 1 + jend_scder_NS = jend_sc + nfld + jbegin_scder_EW = jend_scder_NS + 1 + jend_scder_EW = jend_scder_NS + nfld +else + ndimgmvs = 1 + jbegin_scder_NS = jend_sc + jend_scder_NS = jend_sc + jbegin_scder_EW = jend_sc + jend_scder_EW = jend_sc +endif + +ndimgmv = jend_scder_EW + +allocate(zgmv(nproma,nflevg,ndimgmv,ngpblks)) +allocate(zgmvs(nproma,ndimgmvs,ngpblks)) + +zgpuv => zgmv(:,:,1:jend_vder_EW,:) +zgp3a => zgmv(:,:,jbegin_sc:jend_scder_EW,:) +zgp2 => zgmvs(:,:,:) + +!=================================================================================================== +! Allocate norm arrays +!=================================================================================================== + +if (lprint_norms .or. ncheck > 0) then + allocate(znormsp(1)) + allocate(znormsp1(1)) + allocate(znormvor(nflevg)) + allocate(znormvor1(nflevg)) + allocate(znormdiv(nflevg)) + allocate(znormdiv1(nflevg)) + allocate(znormt(nflevg)) + allocate(znormt1(nflevg)) + + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor1, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv1, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc) + + if (verbosity >= 1) then + do ifld = 1, nflevg + write(nout,'("norm zspvor( ",i4,",:) = ",f20.15)') ifld, znormvor1(ifld) + enddo + do ifld = 1, nflevg + write(nout,'("norm zspdiv( ",i4,",:) = ",f20.15)') ifld, znormdiv1(ifld) + enddo + do ifld = 1, nflevg + write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15)') ifld, znormt1(ifld) + enddo + do ifld = 1, 1 + write(nout,'("norm zspsc2( ",i4,",:) = ",f20.15)') ifld, znormsp1(ifld) + enddo + endif +endif + +!=================================================================================================== +! Setup timers +!=================================================================================================== + +ztinit = (timef() - ztinit)/1000.0_jprd + +if (verbosity >= 0) then + write(nout,'(" ")') + write(nout,'(a,i6,a,f9.2,a)') "transform_test initialisation, on",nproc,& + & " tasks, took",ztinit," sec" + write(nout,'(" ")') +endif + +if (iters <= 0) call abor1('transform_test:iters <= 0') + +allocate(ztstep(iters)) +allocate(ztstep1(iters)) +allocate(ztstep2(iters)) + +ztstepavg = 0._jprd +ztstepmax = 0._jprd +ztstepmin = 9999999999999999._jprd +ztstepavg1 = 0._jprd +ztstepmax1 = 0._jprd +ztstepmin1 = 9999999999999999._jprd +ztstepavg2 = 0._jprd +ztstepmax2 = 0._jprd +ztstepmin2 = 9999999999999999._jprd + +write(nout,'(a)') '======= Start of spectral transforms =======' +write(nout,'(" ")') + +ztloop = timef() + +!=================================================================================================== +! Do spectral transform loop +!=================================================================================================== + +do jstep = 1, iters + call gstats(3,0) + ztstep(jstep) = timef() + + !================================================================================================= + ! Do inverse transform + !================================================================================================= + + ztstep1(jstep) = timef() + call gstats(4,0) + if (lvordiv) then + + ! test different paradigms with small trans first, single field + derivatives, emulating sporog trans in IFS + write(nout,*) 'Test sporog like single transform ...' + call flush(nout) + ! special case when single transform, reset later + iprev = ivsetsc(1) + ivsetsc(1) = nprtrv + ilf = 0 + if(nprtrv == mysetv) then + ilf = 1 + endif + allocate(zreel(nproma,3,ngpblks)) + zreel(:,:,:)=0._jprb + call inv_trans(kresol=1, kproma=nproma, & + & pspscalar=zspsc2(1:ilf,:), & ! spectral scalar + & ldscders=.true., & ! scalar derivatives + & kvsetsc=ivsetsc, & + & pgp=zreel) + + if( lgpnorms ) then + ! reset prev value + ivsetsc(1) = iprev + write(nout,*) 'statistics gpnorm_trans ...' + call flush(nout) + ifld=3 + allocate(zave(ifld)) + allocate(zmin(ifld)) + allocate(zmax(ifld)) + call gpnorm_trans(zreel,ifld,nproma,zave,zmin,zmax,.false.,kresol=1) + do jf=1,ifld + write(nout,*) '1st Statistics field= ',jf,' : ave,min,max ',zave(jf),zmin(jf),zmax(jf) + call flush(nout) + enddo + deallocate(zave) + deallocate(zmin) + deallocate(zmax) + endif + deallocate(zreel) + + write(nout,*) 'standard time-step ...' + call flush(nout) + zgpuv(:,:,:,:) = 0._JPRB +! full time step + call inv_trans(kresol=1, kproma=nproma, & + & pspsc2=zspsc2, & ! spectral surface pressure + & pspvor=zspvor, & ! spectral vorticity + & pspdiv=zspdiv, & ! spectral divergence + & pspsc3a=zspsc3a, & ! spectral scalars + & ldscders=lscders, & + & ldvorgp=.false., & ! no gridpoint vorticity + & lddivgp=.false., & ! no gridpoint divergence + & lduvder=luvders, & + & kvsetuv=ivset, & + & kvsetsc2=ivsetsc, & + & kvsetsc3a=ivset, & + & pgp2=zgp2, & + & pgpuv=zgpuv, & + & pgp3a=zgp3a) + + if( lgpnorms ) then + write(nout,*) 'statistics gpnorm_trans all levels ...' + call flush(nout) + allocate(zave(nflevg)) + allocate(zmin(nflevg)) + allocate(zmax(nflevg)) + ! vorticity only, all levels + ifld=1 + call gpnorm_trans(zgpuv(:,1:nflevg,ifld,:),nflevg,nproma,zave,zmin,zmax,.false.,1) + do jf=1,nflevg + write(nout,*) 'Statistics vorticity level= ',jf,' : ave,min,max ',zave(jf),zmin(jf),zmax(jf) + call flush(nout) + enddo + call gpnorm_trans(zgp3a(:,1:nflevg,ifld,:),nflevg,nproma,zave,zmin,zmax,.false.,1) + do jf=1,nflevg + write(nout,*) 'Statistics scalar level= ',jf,' : ave,min,max ',zave(jf),zmin(jf),zmax(jf) + call flush(nout) + enddo + call gpnorm_trans(zgp3a(:,1:nflevg,ifld+nfld,:),nflevg,nproma,zave,zmin,zmax,.false.,1) + do jf=1,nflevg + write(nout,*) 'Statistics scalar x-der level= ',jf,' : ave,min,max ',zave(jf),zmin(jf),zmax(jf) + call flush(nout) + enddo + call gpnorm_trans(zgp3a(:,1:nflevg,ifld+2*nfld,:),nflevg,nproma,zave,zmin,zmax,.false.,1) + do jf=1,nflevg + write(nout,*) 'Statistics scalar y-der level= ',jf,' : ave,min,max ',zave(jf),zmin(jf),zmax(jf) + call flush(nout) + enddo + deallocate(zave) + deallocate(zmin) + deallocate(zmax) + endif + + ! test different paradigms with small trans first, single field + derivatives, emulating sporog trans in IFS + write(nout,*) 'Test sporog like single transform ...' + call flush(nout) + ! special case when single transform, reset later + iprev = ivsetsc(1) + ivsetsc(1) = nprtrv + ilf = 0 + if(nprtrv == mysetv) then + ilf = 1 + endif + allocate(zreel(nproma,3,ngpblks)) + zreel(:,:,:)=0._jprb + call inv_trans(kresol=1, kproma=nproma, & + & pspscalar=zspsc2(1:ilf,:), & ! spectral scalar + & ldscders=.true., & ! scalar derivatives + & kvsetsc=ivsetsc, & + & pgp=zreel) + + if( lgpnorms ) then + ! reset prev value + ivsetsc(1) = iprev + write(nout,*) 'statistics gpnorm_trans ...' + call flush(nout) + ifld=3 + allocate(zave(ifld)) + allocate(zmin(ifld)) + allocate(zmax(ifld)) + call gpnorm_trans(zreel,ifld,nproma,zave,zmin,zmax,.false.,kresol=1) + do jf=1,ifld + write(nout,*) '2nd Statistics field= ',jf,' : ave,min,max ',zave(jf),zmin(jf),zmax(jf) + call flush(nout) + enddo + deallocate(zave) + deallocate(zmin) + deallocate(zmax) + endif + deallocate(zreel) + + else + call inv_trans(kresol=1, kproma=nproma, & + & pspsc2=zspsc2, & ! spectral surface pressure + & pspsc3a=zspsc3a, & ! spectral scalars + & ldscders=lscders, & ! scalar derivatives + & kvsetsc2=ivsetsc, & + & kvsetsc3a=ivset, & + & pgp2=zgp2, & + & pgp3a=zgp3a) + endif + call gstats(4,1) + + ztstep1(jstep) = (timef() - ztstep1(jstep))/1000.0_jprd + + !================================================================================================= + ! While in grid point space, dump the values to disk, for debugging only + !================================================================================================= + + if (ldump_values) then + ! dump a field to a binary file + call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgp2(:,1,:), 'S', noutdump) + call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgpuv(:,nflevg,1,:), 'U', noutdump) + call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgpuv(:,nflevg,2,:), 'V', noutdump) + call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgp3a(:,nflevg,1,:), 'T', noutdump) + endif + + !================================================================================================= + ! Do direct transform + !================================================================================================= + + ztstep2(jstep) = timef() + + call gstats(5,0) + if (lvordiv) then + call dir_trans(kresol=1, kproma=nproma, & + & pgp2=zgmvs(:,1:1,:), & + & pgpuv=zgpuv(:,:,1:2,:), & + & pgp3a=zgp3a(:,:,1:nfld,:), & + & pspvor=zspvor, & + & pspdiv=zspdiv, & + & pspsc2=zspsc2, & + & pspsc3a=zspsc3a, & + & kvsetuv=ivset, & + & kvsetsc2=ivsetsc, & + & kvsetsc3a=ivset) + else + call dir_trans(kresol=1, kproma=nproma, & + & pgp=zgp3a(:,1,1:nfld,:), & + & pspscalar=zspsc3a(1:1,1:nfld,1), & ! spectral scalar + & kvsetsc=ivset) +! call dir_trans(kresol=1, kproma=nproma, & +! & pgp2=zgmvs(:,1:1,:), & +! & pgp3a=zgp3a(:,:,1:nfld,:), & +! & pspsc2=zspsc2, & +! & pspsc3a=zspsc3a, & +! & kvsetsc2=ivsetsc, & +! & kvsetsc3a=ivset) + endif + call gstats(5,1) + ztstep2(jstep) = (timef() - ztstep2(jstep))/1000.0_jprd + + !================================================================================================= + ! Calculate timings + !================================================================================================= + + ztstep(jstep) = (timef() - ztstep(jstep))/1000.0_jprd + + ztstepavg = ztstepavg + ztstep(jstep) + ztstepmin = min(ztstep(jstep), ztstepmin) + ztstepmax = max(ztstep(jstep), ztstepmax) + + ztstepavg1 = ztstepavg1 + ztstep1(jstep) + ztstepmin1 = min(ztstep1(jstep), ztstepmin1) + ztstepmax1 = max(ztstep1(jstep), ztstepmax1) + + ztstepavg2 = ztstepavg2 + ztstep2(jstep) + ztstepmin2 = min(ztstep2(jstep), ztstepmin2) + ztstepmax2 = max(ztstep2(jstep), ztstepmax2) + + !================================================================================================= + ! Print norms + !================================================================================================= + + if (lprint_norms) then + call gstats(6,0) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc(1:1)) + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg)) + + ! Surface pressure + zmaxerr(:) = -999.0 + do ifld = 1, 1 + zerr(1) = abs(znormsp1(ifld)/znormsp(ifld) - 1.0_jprb) + zmaxerr(1) = max(zmaxerr(1), zerr(1)) + enddo + ! Divergence + do ifld = 1, nflevg + zerr(2) = abs(znormdiv1(ifld)/znormdiv(ifld) - 1.0_jprb) + zmaxerr(2) = max(zmaxerr(2), zerr(2)) + enddo + ! Vorticity + do ifld = 1, nflevg + zerr(3) = abs(znormvor1(ifld)/znormvor(ifld) - 1.0_jprb) + zmaxerr(3) = max(zmaxerr(3),zerr(3)) + enddo + ! Temperature + do ifld = 1, nflevg + zerr(4) = abs(znormt1(ifld)/znormt(ifld) - 1.0_jprb) + zmaxerr(4) = max(zmaxerr(4), zerr(4)) + enddo + write(nout,'("time step ",i6," took", f8.4," | zspvor max err="e10.3,& + & " | zspdiv max err="e10.3," | zspsc3a max err="e10.3," | zspsc2 max err="e10.3)') & + & jstep, ztstep(jstep), zmaxerr(3), zmaxerr(2), zmaxerr(4), zmaxerr(1) + call gstats(6,1) + else + write(nout,'("Time step ",i6," took", f8.4)') jstep, ztstep(jstep) + endif + call gstats(3,1) +enddo + +!=================================================================================================== + +ztloop = (timef() - ztloop)/1000.0_jprd + +write(nout,'(" ")') +write(nout,'(a)') '======= End of spectral transforms =======' +write(nout,'(" ")') + +if (lprint_norms .or. ncheck > 0) then + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc) + + zmaxerr(:) = -999.0 + do ifld = 1, nflevg + zerr(3) = abs(real(znormvor1(ifld),kind=jprd)/real(znormvor(ifld),kind=jprd) - 1.0_jprd) + zmaxerr(3) = max(zmaxerr(3), zerr(3)) + if (verbosity >= 1) then + write(nout,'("norm zspvor( ",i4,") = ",f20.15," error = ",e10.3)') ifld, znormvor1(ifld), zerr(3) + endif + enddo + do ifld = 1, nflevg + zerr(2) = abs(real(znormdiv1(ifld),kind=jprd)/real(znormdiv(ifld),kind=jprd) - 1.0d0) + zmaxerr(2) = max(zmaxerr(2),zerr(2)) + if (verbosity >= 1) then + write(nout,'("norm zspdiv( ",i4,",:) = ",f20.15," error = ",e10.3)') ifld, znormdiv1(ifld), zerr(2) + endif + enddo + do ifld = 1, nflevg + zerr(4) = abs(real(znormt1(ifld),kind=jprd)/real(znormt(ifld),kind=jprd) - 1.0d0) + zmaxerr(4) = max(zmaxerr(4), zerr(4)) + if (verbosity >= 1) then + write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15," error = ",e10.3)') ifld, znormt1(ifld), zerr(4) + endif + enddo + do ifld = 1, 1 + zerr(1) = abs(real(znormsp1(ifld),kind=jprd)/real(znormsp(ifld),kind=jprd) - 1.0d0) + zmaxerr(1) = max(zmaxerr(1), zerr(1)) + if (verbosity >= 1) then + write(nout,'("norm zspsc2( ",i4,",:) = ",f20.15," error = ",e10.3)') ifld, znormsp1(ifld), zerr(1) + endif + enddo + + ! maximum error across all fields + zmaxerrg = max(max(zmaxerr(1),zmaxerr(2)), max(zmaxerr(2), zmaxerr(3))) + + if (verbosity >= 1) write(nout,*) + write(nout,'("max error zspvor(1:nlev,:) = ",e10.3)') zmaxerr(3) + write(nout,'("max error zspdiv(1:nlev,:) = ",e10.3)') zmaxerr(2) + write(nout,'("max error zspsc3a(1:nlev,:,1) = ",e10.3)') zmaxerr(4) + write(nout,'("max error zspsc2(1:1,:) = ",e10.3)') zmaxerr(1) + write(nout,*) + write(nout,'("max error combined = = ",e10.3)') zmaxerrg + write(nout,*) + + if (ncheck > 0 .and. myproc == 1) then + ! If the maximum spectral norm error across all fields is greater than 100 times the machine + ! epsilon, fail the test + if (zmaxerrg > real(ncheck, jprb) * epsilon(1.0_jprb)) then + write(nout, '(a)') '*******************************' + write(nout, '(a)') 'Correctness test failed' + write(nout, '(a,1e7.2)') 'Maximum spectral norm error = ', zmaxerrg + write(nout, '(a,1e7.2)') 'Error tolerance = ', real(ncheck, jprb) * epsilon(1.0_jprb) + write(nout, '(a)') '*******************************' + error stop + endif + endif +endif + +if (luse_mpi) then + call mpl_allreduce(ztloop, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstep, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepavg, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepmax, 'max', ldreprod=.false.) + call mpl_allreduce(ztstepmin, 'min', ldreprod=.false.) + + call mpl_allreduce(ztstep1, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepavg1, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepmax1, 'max', ldreprod=.false.) + call mpl_allreduce(ztstepmin1, 'min', ldreprod=.false.) + + call mpl_allreduce(ztstep2, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepavg2, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepmax2, 'max', ldreprod=.false.) + call mpl_allreduce(ztstepmin2, 'min', ldreprod=.false.) +endif + +ztstepavg = (ztstepavg/real(nproc,jprb))/real(iters,jprd) +ztloop = ztloop/real(nproc,jprd) +ztstep(:) = ztstep(:)/real(nproc,jprd) + +call sort(ztstep,iters) +ztstepmed = ztstep(iters/2) + +ztstepavg1 = (ztstepavg1/real(nproc,jprb))/real(iters,jprd) +ztstep1(:) = ztstep1(:)/real(nproc,jprd) + +call sort(ztstep1, iters) +ztstepmed1 = ztstep1(iters/2) + +ztstepavg2 = (ztstepavg2/real(nproc,jprb))/real(iters,jprd) +ztstep2(:) = ztstep2(:)/real(nproc,jprd) + +call sort(ztstep2,iters) +ztstepmed2 = ztstep2(iters/2) + +write(nout,'(a)') '======= Start of time step stats =======' +write(nout,'(" ")') +write(nout,'("Inverse transforms")') +write(nout,'("------------------")') +write(nout,'("avg (s): ",f8.4)') ztstepavg1 +write(nout,'("min (s): ",f8.4)') ztstepmin1 +write(nout,'("max (s): ",f8.4)') ztstepmax1 +write(nout,'("med (s): ",f8.4)') ztstepmed1 +write(nout,'(" ")') +write(nout,'("Direct transforms")') +write(nout,'("-----------------")') +write(nout,'("avg (s): ",f8.4)') ztstepavg2 +write(nout,'("min (s): ",f8.4)') ztstepmin2 +write(nout,'("max (s): ",f8.4)') ztstepmax2 +write(nout,'("med (s): ",f8.4)') ztstepmed2 +write(nout,'(" ")') +write(nout,'("Inverse-direct transforms")') +write(nout,'("-------------------------")') +write(nout,'("avg (s): ",f8.4)') ztstepavg +write(nout,'("min (s): ",f8.4)') ztstepmin +write(nout,'("max (s): ",f8.4)') ztstepmax +write(nout,'("med (s): ",f8.4)') ztstepmed +write(nout,'("loop (s): ",f8.4)') ztloop +write(nout,'(" ")') +write(nout,'(a)') '======= End of time step stats =======' +write(nout,'(" ")') + +if (lstack) then + ! Gather stack usage statistics + istack = getstackusage() + if (myproc == 1) then + print 9000, istack + 9000 format("Stack utilisation information",/,& + &"=============================",//,& + &"Task size(bytes)",/,& + &"==== ===========",//,& + &" 1",11x,i10) + + do i = 2, nproc + call mpl_recv(istack, ksource=nprcids(i), ktag=i, cdstring='transform_test:') + print '(i4,11x,i10)', i, istack + enddo + else + call mpl_send(istack, kdest=nprcids(1), ktag=myproc, cdstring='transform_test:') + endif +endif + + +!=================================================================================================== +! Cleanup +!=================================================================================================== + +deallocate(zgmv) +deallocate(zgmvs) + +!=================================================================================================== + +if (lstats) then + call gstats(0,1) + call gstats_print(nout, zaveave, jpmaxstat) +endif + +if (lmeminfo) then + write(nout,*) + call ec_meminfo(nout, "", mpl_comm, kbarr=1, kiotask=-1, & + & kcall=1) +endif + +!=================================================================================================== +! Finalize MPI +!=================================================================================================== + +if (luse_mpi) then + call mpl_end(ldmeminfo=.false.) +endif + +!=================================================================================================== +! Close file +!=================================================================================================== + +if (nproc > 1) then + if (myproc /= 1) then + close(unit=nout) + endif +endif + +!=================================================================================================== + +contains + +!=================================================================================================== + +subroutine parse_grid(cgrid,ndgl,nloen) + + character(len=*) :: cgrid + integer, intent(inout) :: ndgl + integer, intent(inout), allocatable :: nloen(:) + integer :: ios + integer :: gaussian_number + read(cgrid(2:len_trim(cgrid)),*,IOSTAT=ios) gaussian_number + if (ios==0) then + ndgl = 2 * gaussian_number + allocate(nloen(ndgl)) + if (cgrid(1:1) == 'F') then ! Regular Gaussian grid + nloen(:) = gaussian_number * 4 + return + endif + if (cgrid(1:1) == 'O') then ! Octahedral Gaussian grid + do i = 1, ndgl / 2 + nloen(i) = 20 + 4 * (i - 1) + nloen(ndgl - i + 1) = nloen(i) + end do + return + endif + endif + call parsing_failed("ERROR: Unsupported grid specified: "// trim(cgrid)) + +end subroutine + +!=================================================================================================== + +function get_int_value(cname, iarg) result(value) + + integer :: value + character(len=*), intent(in) :: cname + integer, intent(inout) :: iarg + character(len=128) :: carg + integer :: stat + + carg = get_str_value(cname, iarg) + call str2int(carg, value, stat) + + if (stat /= 0) then + call parsing_failed("Invalid argument for " // trim(cname) // ": " // trim(carg)) + end if + +end function + +!=================================================================================================== + +function get_str_value(cname, iarg) result(value) + + character(len=128) :: value + character(len=*), intent(in) :: cname + integer, intent(inout) :: iarg + + iarg = iarg + 1 + call get_command_argument(iarg, value) + + if (value == "") then + call parsing_failed("Invalid argument for " // trim(cname) // ": no value provided") + end if + +end function + +!=================================================================================================== + +subroutine parsing_failed(message) + + character(len=*), intent(in) :: message + if (luse_mpi) call mpl_init(ldinfo=.false.) + if (ec_mpirank() == 0) then + write(nerr,"(a)") trim(message) + call print_help(unit=nerr) + endif + if (luse_mpi) call mpl_end(ldmeminfo=.false.) + stop + +end subroutine + +!=================================================================================================== + +subroutine get_command_line_arguments(nsmax, cgrid, iters, nfld, nlev, lvordiv, lscders, luvders, lgpnorms, & + & luseflt, nproma, verbosity, ldump_values, lprint_norms, & + & lmeminfo, nprtrv, nprtrw, ncheck) + + integer, intent(inout) :: nsmax ! Spectral truncation + character(len=16), intent(inout) :: cgrid ! Spectral truncation + integer, intent(inout) :: iters ! Number of iterations for transform test + integer, intent(inout) :: nfld ! Number of scalar fields + integer, intent(inout) :: nlev ! Number of vertical levels + logical, intent(inout) :: lvordiv ! Also transform vorticity/divergence + logical, intent(inout) :: lscders ! Compute scalar derivatives + logical, intent(inout) :: luvders ! Compute uv East-West derivatives + logical, intent(inout) :: lgpnorms ! calculate/print gpnorms + logical, intent(inout) :: luseflt ! Use fast Legendre transforms + integer, intent(inout) :: nproma ! NPROMA + integer, intent(inout) :: verbosity ! Level of verbosity + logical, intent(inout) :: ldump_values ! Dump values of grid point fields for debugging + logical, intent(inout) :: lprint_norms ! Calculate and print spectral norms of fields + logical, intent(inout) :: lmeminfo ! Show information from FIAT ec_meminfo routine at the + ! end + integer, intent(inout) :: nprtrv ! Size of V set (spectral decomposition) + integer, intent(inout) :: nprtrw ! Size of W set (spectral decomposition) + integer, intent(inout) :: ncheck ! The multiplier of the machine epsilon used as a + ! tolerance for correctness checking + + character(len=128) :: carg ! Storage variable for command line arguments + integer :: iarg = 1 ! Argument index + integer :: stat ! For storing success status of string->integer conversion + integer :: myproc + + do while (iarg <= command_argument_count()) + call get_command_argument(iarg, carg) + + select case(carg) + ! Parse help argument + case('-h', '--help') + if (luse_mpi) call mpl_init(ldinfo=.false.) + if (ec_mpirank()==0) call print_help() + if (luse_mpi) call mpl_end(ldmeminfo=.false.) + stop + ! Parse verbosity argument + case('-v') + verbosity = 1 + ! Parse number of iterations argument + case('-n', '--niter') + iters = get_int_value('-n', iarg) + if (iters < 1) then + call parsing_failed("Invalid argument for -n: must be > 0") + end if + ! Parse spectral truncation argument + case('-t', '--truncation') + nsmax = get_int_value('-t', iarg) + if (nsmax < 1) then + call parsing_failed("Invalid argument for -t: must be > 0") + end if + case('-g', '--grid'); cgrid = get_str_value('-g', iarg) + case('-f', '--nfld'); nfld = get_int_value('-f', iarg) + case('-l', '--nlev'); nlev = get_int_value('-l', iarg) + case('--vordiv'); lvordiv = .True. + case('--scders'); lscders = .True. + case('--uvders'); luvders = .True. + case('--lgpnorms'); lgpnorms = .True. + case('--flt'); luseflt = .True. + case('--nproma'); nproma = get_int_value('--nproma', iarg) + case('--dump-values'); ldump_values = .true. + case('--norms'); lprint_norms = .true. + case('--meminfo'); lmeminfo = .true. + case('--nprtrv'); nprtrv = get_int_value('--nprtrv', iarg) + case('--nprtrw'); nprtrw = get_int_value('--nprtrw', iarg) + case('-c', '--check'); ncheck = get_int_value('-c', iarg) + case default + call parsing_failed("Unrecognised argument: " // trim(carg)) + + end select + iarg = iarg + 1 + end do + + if (.not. lvordiv) then + luvders = .false. + endif + +end subroutine get_command_line_arguments + +!=================================================================================================== + +function cubic_octahedral_gaussian_grid(nsmax) result(cgrid) + + character(len=16) :: cgrid + integer, intent(in) :: nsmax + write(cgrid,'(a,i0)') 'O',nsmax+1 + +end function + +!=================================================================================================== + +subroutine str2int(str, int, stat) + + character(len=*), intent(in) :: str + integer, intent(out) :: int + integer, intent(out) :: stat + read(str, *, iostat=stat) int + +end subroutine str2int + +!=================================================================================================== + +subroutine sort(a, n) + + real(kind=jprd), intent(inout) :: a(n) + integer(kind=jpim), intent(in) :: n + + real(kind=jprd) :: x + + integer :: i, j + + do i = 2, n + x = a(i) + j = i - 1 + do while (j >= 1) + if (a(j) <= x) exit + a(j + 1) = a(j) + j = j - 1 + end do + a(j + 1) = x + end do + +end subroutine sort + +!=================================================================================================== + +subroutine print_help(unit) + + integer, optional :: unit + integer :: nout = 6 + if (present(unit)) then + nout = unit + endif + + write(nout, "(a)") "" + + if (jprb == jprd) then + write(nout, "(a)") "NAME ectrans-benchmark-dp" + else + write(nout, "(a)") "NAME ectrans-benchmark-sp" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "DESCRIPTION" + write(nout, "(a)") " This program tests ecTrans by transforming fields back and forth& + & between spectral " + if (jprb == jprd) then + write(nout, "(a)") " space and grid-point space (double-precision version)" + else + write(nout, "(a)") " space and grid-point space (single-precision version)" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "USAGE" + if (jprb == jprd) then + write(nout, "(a)") " ectrans-benchmark-dp [options]" + else + write(nout, "(a)") " ectrans-benchmark-sp [options]" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "OPTIONS" + write(nout, "(a)") " -h, --help Print this message" + write(nout, "(a)") " -v Run with verbose output" + write(nout, "(a)") " -t, --truncation T Run with this triangular spectral truncation& + & (default = 79)" + write(nout, "(a)") " -g, --grid GRID Run with this grid. Possible values: O, F" + write(nout, "(a)") " If not specified, O is used with N=truncation+1& + & (cubic relation)" + write(nout, "(a)") " -n, --niter NITER Run for this many inverse/direct transform& + & iterations (default = 10)" + write(nout, "(a)") " -f, --nfld NFLD Number of scalar fields (default = 1)" + write(nout, "(a)") " -l, --nlev NLEV Number of vertical levels (default = 1)" + write(nout, "(a)") " --vordiv Also transform vorticity-divergence to wind" + write(nout, "(a)") " --scders Compute scalar derivatives (default off)" + write(nout, "(a)") " --uvders Compute uv East-West derivatives (default off). Only& + & when also --vordiv is given" + write(nout, "(a)") " --flt Run with fast Legendre transforms (default off)" + write(nout, "(a)") " --nproma NPROMA Run with NPROMA (default no blocking: NPROMA=ngptot)" + write(nout, "(a)") " --norms Calculate and print spectral norms of transformed& + & fields" + write(nout, "(a)") " The computation of spectral norms will skew overall& + & timings" + write(nout, "(a)") " --meminfo Show diagnostic information from FIAT's ec_meminfo& + & subroutine on memory usage, thread-binding etc." + write(nout, "(a)") " --nprtrv Size of V set in spectral decomposition" + write(nout, "(a)") " --nprtrw Size of W set in spectral decomposition" + write(nout, "(a)") " -c, --check VALUE The multiplier of the machine epsilon used as a& + & tolerance for correctness checking" + write(nout, "(a)") "" + write(nout, "(a)") "DEBUGGING" + write(nout, "(a)") " --dump-values Output gridpoint fields in unformatted binary file" + write(nout, "(a)") "" + +end subroutine print_help + +!=================================================================================================== + +subroutine initialize_spectral_arrays(nsmax, zsp, sp3d) + + integer, intent(in) :: nsmax ! Spectral truncation + real(kind=jprb), intent(inout) :: zsp(:,:) ! Surface pressure + real(kind=jprb), intent(inout) :: sp3d(:,:,:) ! 3D fields + + integer(kind=jpim) :: nflevl + integer(kind=jpim) :: nfield + + integer :: i, j + + nflevl = size(sp3d, 1) + nfield = size(sp3d, 3) + + ! First initialize surface pressure + call initialize_2d_spectral_field(nsmax, zsp(1,:)) + + ! Then initialize all of the 3D fields + do i = 1, nflevl + do j = 1, nfield + call initialize_2d_spectral_field(nsmax, sp3d(i,:,j)) + end do + end do + +end subroutine initialize_spectral_arrays + +!=================================================================================================== + +subroutine initialize_2d_spectral_field(nsmax, field) + + integer, intent(in) :: nsmax ! Spectral truncation + real(kind=jprb), intent(inout) :: field(:) ! Field to initialize + + integer :: i, index, num_my_zon_wns + integer, allocatable :: my_zon_wns(:), nasm0(:) + + ! Choose a spherical harmonic to initialize arrays + integer :: m_num = 4 ! Zonal wavenumber + integer :: l_num = 19 ! Total wavenumber + + ! First initialise all spectral coefficients to zero + field(:) = 0.0 + + ! Get zonal wavenumbers this rank is responsible for + call trans_inq(knump=num_my_zon_wns) + allocate(my_zon_wns(num_my_zon_wns)) + call trans_inq(kmyms=my_zon_wns) + + ! If rank is responsible for the chosen zonal wavenumber... + if (any(my_zon_wns == m_num) ) then + ! Get array of spectral array addresses (this maps (m, n=m) to array index) + allocate(nasm0(0:nsmax)) + call trans_inq(kasm0=nasm0) + + ! Find out local array index of chosen spherical harmonic + index = nasm0(m_num) + 2 * (l_num - m_num) + 1 + + ! Set just that element to a constant value + field(index) = 1.0 + else + return + end if + +end subroutine initialize_2d_spectral_field + +!=================================================================================================== + +subroutine dump_gridpoint_field(jstep, myproc, nproma, ngpblks, fld, fldchar, noutdump) + + ! Dump a 2d field to a binary file. + + integer(kind=jpim), intent(in) :: jstep ! Time step, used for naming file + integer(kind=jpim), intent(in) :: myproc ! MPI rank, used for naming file + integer(kind=jpim), intent(in) :: nproma ! Size of nproma + integer(kind=jpim), intent(in) :: ngpblks ! Number of nproma blocks + real(kind=jprb) , intent(in) :: fld(nproma,ngpblks) ! 2D field + character , intent(in) :: fldchar ! Single character field identifier + integer(kind=jpim), intent(in) :: noutdump ! Tnit number for output file + + character(len=14) :: filename = "x.xxx.xxxx.dat" + + write(filename(1:1),'(a1)') fldchar + write(filename(3:5),'(i3.3)') jstep + write(filename(7:10),'(i4.4)') myproc + + open(noutdump, file=filename, form="unformatted") + write(noutdump) reshape(fld, (/ nproma*ngpblks /)) + close(noutdump) + +end subroutine dump_gridpoint_field + +!=================================================================================================== + +function detect_mpirun() result(lmpi_required) + logical :: lmpi_required + integer :: ilen + integer, parameter :: nvars = 5 + character(len=32), dimension(nvars) :: cmpirun_detect + character(len=4) :: clenv_dr_hook_assert_mpi_initialized + integer :: ivar + + ! Environment variables that are set when mpirun, srun, aprun, ... are used + cmpirun_detect(1) = 'OMPI_COMM_WORLD_SIZE' ! openmpi + cmpirun_detect(2) = 'ALPS_APP_PE' ! cray pe + cmpirun_detect(3) = 'PMI_SIZE' ! intel + cmpirun_detect(4) = 'SLURM_NTASKS' ! slurm + cmpirun_detect(5) = 'ECTRANS_USE_MPI' ! forced + + lmpi_required = .false. + do ivar = 1, nvars + call get_environment_variable(name=trim(cmpirun_detect(ivar)), length=ilen) + if (ilen > 0) then + lmpi_required = .true. + exit ! break + endif + enddo +end function + +!=================================================================================================== + +! Assign GSTATS labels to the main regions of ecTrans +subroutine gstats_labels + + call gstats_label(0, ' ', 'PROGRAM - Total') + call gstats_label(1, ' ', 'SETUP_TRANS0 - Setup ecTrans') + call gstats_label(2, ' ', 'SETUP_TRANS - Setup ecTrans handle') + call gstats_label(3, ' ', 'TIME STEP - Time step') + call gstats_label(4, ' ', 'INV_TRANS - Inverse transform') + call gstats_label(5, ' ', 'DIR_TRANS - Direct transform') + call gstats_label(6, ' ', 'NORMS - Norm comp. (optional)') + call gstats_label(102, ' ', 'LTINV_CTL - Inv. Legendre transform') + call gstats_label(103, ' ', 'LTDIR_CTL - Dir. Legendre transform') + call gstats_label(106, ' ', 'FTDIR_CTL - Dir. Fourier transform') + call gstats_label(107, ' ', 'FTINV_CTL - Inv. Fourier transform') + call gstats_label(140, ' ', 'SULEG - Comp. of Leg. poly.') + call gstats_label(152, ' ', 'LTINV_CTL - M to L transposition') + call gstats_label(153, ' ', 'LTDIR_CTL - L to M transposition') + call gstats_label(157, ' ', 'FTINV_CTL - L to G transposition') + call gstats_label(158, ' ', 'FTDIR_CTL - G to L transposition') + call gstats_label(400, ' ', 'GSTATS - GSTATS itself') + +end subroutine gstats_labels + +!=================================================================================================== + +subroutine set_ectrans_gpu_nflev(kflev) + use ec_env_mod, only : ec_putenv + integer(kind=jpim), intent(in) :: kflev + character(len=32) :: ECTRANS_GPU_NFLEV + write(ECTRANS_GPU_NFLEV,'(A,I0)') "ECTRANS_GPU_NFLEV=",kflev + call ec_putenv(ECTRANS_GPU_NFLEV, overwrite=.true.) +end subroutine + + +end program transform_test + +!=================================================================================================== diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index 35059838..b84e58be 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -7,7 +7,7 @@ ! nor does it submit to any jurisdiction. ! -program transform_test +program ectrans_benchmark ! ! Spectral transform test @@ -43,7 +43,7 @@ program transform_test use parkind1, only: jpim, jprb, jprd use oml_mod ,only : oml_max_threads use mpl_module -use yomgstats, only: jpmaxstat +use yomgstats, only: jpmaxstat, gstats_lstats => lstats use yomhook, only : dr_hook_init implicit none @@ -291,7 +291,7 @@ program transform_test if (nprtrv > 0 .or. nprtrw > 0) then if (nprtrv == 0) nprtrv = nproc/nprtrw if (nprtrw == 0) nprtrw = nproc/nprtrv - if (nprtrw*nprtrv /= nproc) call abor1('transform_test:nprtrw*nprtrv /= nproc') + if (nprtrw*nprtrv /= nproc) call abor1('ectrans_benchmark:nprtrw*nprtrv /= nproc') else do jprtrv = 4, nproc nprtrv = jprtrv @@ -328,7 +328,7 @@ program transform_test iprtrv = mod(myproc - 1, nprtrv) + 1 iprtrw = (myproc - 1)/nprtrv + 1 if (iprtrv /= mysetv .or. iprtrw /= mysetw) then - call abor1('transform_test:inconsistency when computing mysetw and mysetv') + call abor1('ectrans_benchmark:inconsistency when computing mysetw and mysetv') endif endif @@ -383,9 +383,13 @@ program transform_test call gstats(1, 1) call gstats(2, 0) -call setup_trans(ksmax=nsmax, kdgl=ndgl, kloen=nloen, ldsplit=.true., & - & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & - & lduseflt=luseflt) +! IFS spectral fields are dimensioned NFLEVL, Nils !! +call set_ectrans_gpu_nflev(nflevl) + ! We pass nflevl via environment variable in order not to change API + ! In long run, ectrans should grow its internal buffers automatically +call setup_trans(ksmax=nsmax, kdgl=ndgl, kloen=nloen, ldsplit=.true., & + & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & + & lduseflt=luseflt) call gstats(2, 1) call trans_inq(kspec2=nspec2, kspec2g=nspec2g, kgptot=ngptot, kgptotg=ngptotg) @@ -402,7 +406,7 @@ program transform_test !=================================================================================================== ! Print configuration details -if (verbosity >= 0) then +if (verbosity >= 0 .and. myproc == 1) then write(nout,'(" ")') write(nout,'(a)')'======= Start of runtime parameters =======' write(nout,'(" ")') @@ -530,7 +534,7 @@ program transform_test endif call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc) - if (verbosity >= 1) then + if (verbosity >= 1 .and. myproc == 1) then do ifld = 1, nflevg write(nout,'("norm zspvor( ",i4,",:) = ",f20.15)') ifld, znormvor1(ifld) write(nout,'("0x",Z16.16)') znormvor1(ifld) @@ -558,18 +562,18 @@ program transform_test ztinit = (timef() - ztinit)/1000.0_jprd -if (verbosity >= 0) then +if (verbosity >= 0 .and. myproc == 1) then write(nout,'(" ")') - write(nout,'(a,i6,a,f9.2,a)') "transform_test initialisation, on",nproc,& + write(nout,'(a,i6,a,f9.2,a)') "ectrans_benchmark initialisation, on",nproc,& & " tasks, took",ztinit," sec" write(nout,'(" ")') endif -if (iters <= 0) call abor1('transform_test:iters <= 0') +if (iters <= 0) call abor1('ectrans_benchmark:iters <= 0') -allocate(ztstep(iters)) -allocate(ztstep1(iters)) -allocate(ztstep2(iters)) +allocate(ztstep(iters+2)) +allocate(ztstep1(iters+2)) +allocate(ztstep2(iters+2)) ztstepavg = 0._jprd ztstepmax = 0._jprd @@ -581,8 +585,10 @@ program transform_test ztstepmax2 = 0._jprd ztstepmin2 = 9999999999999999._jprd -write(nout,'(a)') '======= Start of spectral transforms =======' -write(nout,'(" ")') +if (verbosity >= 1 .and. myproc == 1) then + write(nout,'(a)') '======= Start of spectral transforms =======' + write(nout,'(" ")') +endif ztloop = timef() @@ -590,7 +596,14 @@ program transform_test ! Do spectral transform loop !=================================================================================================== -do jstep = 1, iters +gstats_lstats = .false. + +write(nout,'(a,i5,a)') 'Running for ', iters, ' iterations with 2 extra warm-up iterations' +write(nout,'(" ")') + +do jstep = 1, iters+2 + if (jstep == 3) gstats_lstats = .true. + call gstats(3,0) ztstep(jstep) = timef() @@ -911,11 +924,11 @@ program transform_test &" 1",11x,i10) do i = 2, nproc - call mpl_recv(istack, ksource=nprcids(i), ktag=i, cdstring='transform_test:') + call mpl_recv(istack, ksource=nprcids(i), ktag=i, cdstring='ectrans_benchmark:') print '(i4,11x,i10)', i, istack enddo else - call mpl_send(istack, kdest=nprcids(1), ktag=myproc, cdstring='transform_test:') + call mpl_send(istack, kdest=nprcids(1), ktag=myproc, cdstring='ectrans_benchmark:') endif endif @@ -1071,6 +1084,10 @@ subroutine get_command_line_arguments(nsmax, cgrid, iters, nfld, nlev, lvordiv, character(len=128) :: carg ! Storage variable for command line arguments integer :: iarg = 1 ! Argument index +#ifdef ACCGPU + !$acc init +#endif + do while (iarg <= command_argument_count()) call get_command_argument(iarg, carg) @@ -1193,9 +1210,9 @@ subroutine print_help(unit) write(nout, "(a)") "" if (jprb == jprd) then - write(nout, "(a)") "NAME ectrans-benchmark-dp" + write(nout, "(a)") "NAME ectrans-benchmark-" // VERSION // "-dp" else - write(nout, "(a)") "NAME ectrans-benchmark-sp" + write(nout, "(a)") "NAME ectrans-benchmark-" // VERSION // "-sp" end if write(nout, "(a)") "" @@ -1211,9 +1228,9 @@ subroutine print_help(unit) write(nout, "(a)") "USAGE" if (jprb == jprd) then - write(nout, "(a)") " ectrans-benchmark-dp [options]" + write(nout, "(a)") " ectrans-benchmark-" // VERSION // "-dp [options]" else - write(nout, "(a)") " ectrans-benchmark-sp [options]" + write(nout, "(a)") " ectrans-benchmark-" // VERSION // "-sp [options]" end if write(nout, "(a)") "" @@ -1348,11 +1365,12 @@ end subroutine dump_gridpoint_field !=================================================================================================== function detect_mpirun() result(lmpi_required) + use ec_env_mod, only : ec_putenv logical :: lmpi_required integer :: ilen - integer, parameter :: nvars = 5 + integer, parameter :: nvars = 4 character(len=32), dimension(nvars) :: cmpirun_detect -! character(len=4) :: clenv_dr_hook_assert_mpi_initialized + character(len=4) :: clenv integer :: ivar ! Environment variables that are set when mpirun, srun, aprun, ... are used @@ -1360,7 +1378,6 @@ function detect_mpirun() result(lmpi_required) cmpirun_detect(2) = 'ALPS_APP_PE' ! cray pe cmpirun_detect(3) = 'PMI_SIZE' ! intel cmpirun_detect(4) = 'SLURM_NTASKS' ! slurm - cmpirun_detect(5) = 'ECTRANS_USE_MPI' ! forced lmpi_required = .false. do ivar = 1, nvars @@ -1370,6 +1387,15 @@ function detect_mpirun() result(lmpi_required) exit ! break endif enddo + + call get_environment_variable(name="ECTRANS_USE_MPI", value=clenv, length=ilen ) + if (ilen > 0) then + lmpi_required = .true. + if( trim(clenv) == "0" .or. trim(clenv) == "OFF" .or. trim(CLENV) == "off" .or. trim(clenv) == "F" ) then + lmpi_required = .false. + endif + call ec_putenv("DR_HOOK_ASSERT_MPI_INITIALIZED=0", overwrite=.true.) + endif end function !=================================================================================================== @@ -1397,6 +1423,16 @@ subroutine gstats_labels end subroutine gstats_labels -end program transform_test +!=================================================================================================== + +subroutine set_ectrans_gpu_nflev(kflev) + use ec_env_mod, only : ec_putenv + integer(kind=jpim), intent(in) :: kflev + character(len=32) :: ECTRANS_GPU_NFLEV + write(ECTRANS_GPU_NFLEV,'(A,I0)') "ECTRANS_GPU_NFLEV=",kflev + call ec_putenv(ECTRANS_GPU_NFLEV, overwrite=.true.) +end subroutine + +end program ectrans_benchmark !=================================================================================================== diff --git a/src/trans/CMakeLists.txt b/src/trans/CMakeLists.txt index 38c2ad47..87db1230 100644 --- a/src/trans/CMakeLists.txt +++ b/src/trans/CMakeLists.txt @@ -6,256 +6,9 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. -# Preprocess module file containing version information -configure_file( internal/ectrans_version_mod.F90.in internal/ectrans_version_mod.F90 ) - -## Apply workarounds for some known compilers - -if(CMAKE_Fortran_COMPILER_ID MATCHES "Cray") - if( CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER_EQUAL 8.7 ) - - # Fix for IFS "CONGRAD: SPTSV/DPTSV returned non-zero info with crayftn 8.7.7 (cdt/18.12) - ectrans_add_compile_options( - SOURCES internal/ftinv_ctlad_mod.F90 - FLAGS "-O0,fp1,omp") - - endif() -endif() - -if(CMAKE_Fortran_COMPILER_ID MATCHES "Intel") - if( CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER_EQUAL 18 AND CMAKE_Fortran_COMPILER_VERSION VERSION_LESS 19 ) - - # See https://github.com/ecmwf-ifs/ectrans/issues/17 - ectrans_add_compile_options( - SOURCES algor/butterfly_alg_mod.F90 - FLAGS "-check nopointers") - endif() +if( HAVE_CPU) + add_subdirectory( cpu ) endif() - -## Sources which are precision independent can go into a common library -list( APPEND ectrans_common_src - algor/ectrans_blas_mod.F90 - sharedmem/sharedmem_mod.F90 - sharedmem/sharedmem.c - internal/abort_trans_mod.F90 - internal/cpledn_mod.F90 - internal/gawl_mod.F90 - internal/sugaw_mod.F90 - internal/supol_mod.F90 - internal/supolf_mod.F90 - internal/tpm_constants.F90 - internal/tpm_ctl.F90 - internal/tpm_dim.F90 - internal/tpm_gen.F90 - internal/tpm_geometry.F90 - internal/tpm_pol.F90 - internal/tpm_distr.F90 - internal/pe2set_mod.F90 - internal/set2pe_mod.F90 - internal/eq_regions_mod.F90 - internal/sump_trans0_mod.F90 - internal/sustaonl_mod.F90 - internal/sumplat_mod.F90 - internal/sumplatb_mod.F90 - internal/sumplatbeq_mod.F90 - internal/sumplatf_mod.F90 - internal/mysendset_mod.F90 - internal/myrecvset_mod.F90 - internal/suwavedi_mod.F90 - internal/sump_trans_preleg_mod.F90 - external/get_current.F90 - external/setup_trans0.F90 - ${CMAKE_CURRENT_BINARY_DIR}/internal/ectrans_version_mod.F90 -) -list( APPEND ectrans_common_includes - include/ectrans/get_current.h - include/ectrans/setup_trans0.h -) - -ecbuild_add_library( - TARGET ectrans_common - LINKER_LANGUAGE Fortran - SOURCES ${ectrans_common_src} - PUBLIC_LIBS fiat - PRIVATE_LIBS ${LAPACK_LIBRARIES} -) -ectrans_target_fortran_module_directory( - TARGET ectrans_common - MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/module/ectrans - INSTALL_DIRECTORY module/ectrans -) - -if( HAVE_OMP ) - ecbuild_debug("target_link_libraries( trans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") - target_link_libraries( ectrans_common PRIVATE OpenMP::OpenMP_Fortran ) +if( HAVE_GPU ) + add_subdirectory( gpu ) endif() - - - -function(generate_file) - set (options) - set (oneValueArgs INPUT OUTPUT BACKEND) - set (multiValueArgs) - cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(output ${_PAR_OUTPUT}) - set(input ${_PAR_INPUT}) - set(backend ${_PAR_BACKEND}) - - set( JPRB_dp JPRD ) - set( jprb_dp jprd ) - set( JPRB_sp JPRM ) - set( jprb_sp jprm ) - - add_custom_command( - OUTPUT ${output} - COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/sedrenames.txt | - sed -e "s/VARIANTDESIGNATOR/${backend}/g" | - sed -e "s/TYPEDESIGNATOR_UPPER/${JPRB_${backend}}/g" | - sed -e "s/TYPEDESIGNATOR_LOWER/${jprb_${backend}}/g" | - sed -rf - ${CMAKE_CURRENT_SOURCE_DIR}/${input} > ${output} - DEPENDS ${input} ${CMAKE_CURRENT_SOURCE_DIR}/sedrenames.txt - COMMENT "Generating ${output}" - VERBATIM - ) -endfunction(generate_file) - - -function(generate_backend_includes) - set (options) - set (oneValueArgs BACKEND TARGET DESTINATION) - set (multiValueArgs) - cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(destination ${_PAR_DESTINATION} ) - set(backend ${_PAR_BACKEND}) - - file(MAKE_DIRECTORY ${destination}) - file(MAKE_DIRECTORY ${destination}/trans_${backend}) - - ecbuild_list_add_pattern( LIST files GLOB include/ectrans/*.h QUIET ) - - set( outfiles ) - foreach(file_i ${files}) - get_filename_component(outfile_name ${file_i} NAME) - get_filename_component(outfile_name_we ${file_i} NAME_WE) - get_filename_component(outfile_ext ${file_i} EXT) - get_filename_component(outfile_dir ${file_i} DIRECTORY) - if (${file_i} IN_LIST ectrans_common_includes) - configure_file(${file_i} ${destination}/${outfile_name}) - else() - set(outfile "${destination}/${outfile_name_we}_${backend}${outfile_ext}") - ecbuild_debug("Generate ${outfile}") - generate_file(BACKEND ${backend} INPUT ${file_i} OUTPUT ${outfile}) - list(APPEND outfiles ${outfile}) - string(TOUPPER ${outfile_name_we} OUTFILE_NAME_WE ) - ecbuild_debug("Generate ${destination}/trans_${backend}/${outfile_name}") - file(WRITE ${destination}/trans_${backend}/${outfile_name} "! Automatically generated interface header for backward compatibility of generic symbols !\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#if defined(${outfile_name_we})\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#undef ${outfile_name_we}\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#endif\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#if defined(${OUTFILE_NAME_WE})\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#undef ${OUTFILE_NAME_WE}\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#endif\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#include \"${outfile_name_we}_${backend}${outfile_ext}\"\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#define ${outfile_name_we} ${OUTFILE_NAME_WE}_${backend}\n") - file(APPEND ${destination}/trans_${backend}/${outfile_name} "#define ${OUTFILE_NAME_WE} ${OUTFILE_NAME_WE}_${backend}\n") - endif() - endforeach(file_i) - - add_custom_target(${_PAR_TARGET}_generate DEPENDS ${outfiles}) - ecbuild_add_library(TARGET ${_PAR_TARGET} TYPE INTERFACE) - add_dependencies(${_PAR_TARGET} ${_PAR_TARGET}_generate) - target_include_directories(${_PAR_TARGET} INTERFACE $) -endfunction(generate_backend_includes) - - - -function(generate_backend_sources) - set (options) - set (oneValueArgs BACKEND DESTINATION OUTPUT) - set (multiValueArgs) - - cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(backend ${_PAR_BACKEND}) - set(destination ${_PAR_DESTINATION}) - file(MAKE_DIRECTORY ${destination}/algor) - file(MAKE_DIRECTORY ${destination}/internal) - file(MAKE_DIRECTORY ${destination}/external) - - ecbuild_list_add_pattern( LIST files - GLOB - algor/* - internal/* - external/* - QUIET - ) - - set(outfiles) - foreach(file_i ${files}) - if(NOT (${file_i} IN_LIST ectrans_common_src)) - get_filename_component(outfile_name ${file_i} NAME) - get_filename_component(outfile_name_we ${file_i} NAME_WE) - get_filename_component(outfile_ext ${file_i} EXT) - get_filename_component(outfile_dir ${file_i} DIRECTORY) - set(outfile "${destination}/${file_i}") - ecbuild_debug("Generate ${outfile}") - generate_file(BACKEND ${backend} INPUT ${file_i} OUTPUT ${outfile}) - list(APPEND outfiles ${outfile}) - endif() - endforeach(file_i) - set(${_PAR_OUTPUT} ${outfiles} PARENT_SCOPE) -endfunction(generate_backend_sources) - -set( BUILD_INTERFACE_INCLUDE_DIR ${CMAKE_BINARY_DIR}/include/ectrans ) - -foreach( prec dp sp ) - if( HAVE_${prec} ) - - generate_backend_includes(BACKEND ${prec} TARGET ectrans_${prec}_includes DESTINATION ${BUILD_INTERFACE_INCLUDE_DIR} ) - generate_backend_sources( BACKEND ${prec} OUTPUT ectrans_${prec}_src DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/generated/ectrans_${prec}) - ecbuild_add_library( - TARGET ectrans_${prec} - LINKER_LANGUAGE Fortran - SOURCES ${ectrans_${prec}_src} - PUBLIC_INCLUDES $ - $ - $ - PUBLIC_LIBS ectrans_common ectrans_${prec}_includes - ) - - ectrans_target_fortran_module_directory( - TARGET ectrans_${prec} - MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/module/ectrans_${prec} - INSTALL_DIRECTORY module/ectrans_${prec} - ) - target_link_libraries( ectrans_${prec} PUBLIC fiat) - - set( FFTW_LINK PRIVATE ) - if( LAPACK_LIBRARIES MATCHES "mkl" AND NOT FFTW_LIBRARIES MATCHES "mkl" ) - ecbuild_warn( "Danger: Both MKL and FFTW are linked in trans_${prec}. " - "No guarantees on link order can be made for the final executable.") - set( FFTW_LINK PUBLIC ) # Attempt anyway to give FFTW precedence - endif() - ecbuild_debug("target_link_libraries( trans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} )") - target_link_libraries( ectrans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} ) - target_include_directories( ectrans_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) - target_compile_definitions( ectrans_${prec} PRIVATE WITH_FFTW ) - ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE ${LAPACK_LIBRARIES} )") - target_link_libraries( ectrans_${prec} PRIVATE ${LAPACK_LIBRARIES} ) - - if( HAVE_OMP ) - ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") - target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran ) - endif() - - # This interface library is for backward compatibility, and provides the older includes - ecbuild_add_library( TARGET trans_${prec} TYPE INTERFACE ) - target_include_directories( trans_${prec} INTERFACE $ ) - target_include_directories( trans_${prec} INTERFACE $ ) - target_link_libraries( trans_${prec} INTERFACE fiat ectrans_${prec} parkind_${prec}) - endif() -endforeach() - -## Install trans interface -install( DIRECTORY ${BUILD_INTERFACE_INCLUDE_DIR}/ DESTINATION include/ectrans ) diff --git a/src/trans/cpu/CMakeLists.txt b/src/trans/cpu/CMakeLists.txt new file mode 100644 index 00000000..01f5bcb8 --- /dev/null +++ b/src/trans/cpu/CMakeLists.txt @@ -0,0 +1,265 @@ +# (C) Copyright 2020- ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +# Preprocess module file containing version information +configure_file( internal/ectrans_version_mod.F90.in internal/ectrans_version_mod.F90 ) + +## Apply workarounds for some known compilers + +if(CMAKE_Fortran_COMPILER_ID MATCHES "Cray") + if( CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER_EQUAL 8.7 ) + + # Fix for IFS "CONGRAD: SPTSV/DPTSV returned non-zero info with crayftn 8.7.7 (cdt/18.12) + ectrans_add_compile_options( + SOURCES internal/ftinv_ctlad_mod.F90 + FLAGS "-O0,fp1,omp") + + endif() +endif() + +if(CMAKE_Fortran_COMPILER_ID MATCHES "Intel") + if( CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER_EQUAL 18 AND CMAKE_Fortran_COMPILER_VERSION VERSION_LESS 19 ) + + # See https://github.com/ecmwf-ifs/ectrans/issues/17 + ectrans_add_compile_options( + SOURCES algor/butterfly_alg_mod.F90 + FLAGS "-check nopointers") + endif() +endif() + +## Sources which are precision independent can go into a common library +list( APPEND ectrans_common_src + algor/ectrans_blas_mod.F90 + sharedmem/sharedmem_mod.F90 + sharedmem/sharedmem.c + internal/abort_trans_mod.F90 + internal/cpledn_mod.F90 + internal/gawl_mod.F90 + internal/sugaw_mod.F90 + internal/supol_mod.F90 + internal/supolf_mod.F90 + internal/tpm_constants.F90 + internal/tpm_ctl.F90 + internal/tpm_dim.F90 + internal/tpm_gen.F90 + internal/tpm_geometry.F90 + internal/tpm_pol.F90 + internal/tpm_distr.F90 + internal/pe2set_mod.F90 + internal/set2pe_mod.F90 + internal/eq_regions_mod.F90 + internal/sump_trans0_mod.F90 + internal/sustaonl_mod.F90 + internal/sumplat_mod.F90 + internal/sumplatb_mod.F90 + internal/sumplatbeq_mod.F90 + internal/sumplatf_mod.F90 + internal/mysendset_mod.F90 + internal/myrecvset_mod.F90 + internal/suwavedi_mod.F90 + internal/sump_trans_preleg_mod.F90 + external/get_current.F90 + external/setup_trans0.F90 + ${CMAKE_CURRENT_BINARY_DIR}/internal/ectrans_version_mod.F90 +) +list( APPEND ectrans_common_includes + ectrans/get_current.h + ectrans/setup_trans0.h +) + +ecbuild_add_library( + TARGET ectrans_common + LINKER_LANGUAGE Fortran + SOURCES ${ectrans_common_src} + PUBLIC_LIBS fiat + PRIVATE_LIBS ${LAPACK_LIBRARIES} +) +ectrans_target_fortran_module_directory( + TARGET ectrans_common + MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/module/ectrans + INSTALL_DIRECTORY module/ectrans +) + +if( HAVE_OMP ) + ecbuild_debug("target_link_libraries( trans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") + target_link_libraries( ectrans_common PRIVATE OpenMP::OpenMP_Fortran ) +endif() + + + +function(generate_file) + set (options) + set (oneValueArgs INPUT OUTPUT BACKEND) + set (multiValueArgs) + cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(output ${_PAR_OUTPUT}) + set(input ${_PAR_INPUT}) + set(backend ${_PAR_BACKEND}) + + set( JPRB_dp JPRD ) + set( jprb_dp jprd ) + set( JPRB_sp JPRM ) + set( jprb_sp jprm ) + + add_custom_command( + OUTPUT ${output} + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/sedrenames.txt | + sed -e "s/VARIANTDESIGNATOR/${backend}/g" | + sed -e "s/TYPEDESIGNATOR_UPPER/${JPRB_${backend}}/g" | + sed -e "s/TYPEDESIGNATOR_LOWER/${jprb_${backend}}/g" | + sed -rf - ${input} > ${output} + DEPENDS ${input} ${CMAKE_CURRENT_SOURCE_DIR}/sedrenames.txt + COMMENT "Generating ${output}" + VERBATIM + ) +endfunction(generate_file) + + +function(generate_backend_includes) + set (options) + set (oneValueArgs BACKEND TARGET DESTINATION INCLUDE_DIRECTORY) + set (multiValueArgs) + cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(destination ${_PAR_DESTINATION} ) + set(backend ${_PAR_BACKEND}) + + file(MAKE_DIRECTORY ${destination}) + file(MAKE_DIRECTORY ${destination}/trans_${backend}) + + ecbuild_list_add_pattern( LIST absolute_files GLOB ectrans/*.h SOURCE_DIR ${_PAR_INCLUDE_DIRECTORY} QUIET ) + set( files ) + foreach(file_i ${absolute_files}) + file(RELATIVE_PATH file_i ${_PAR_INCLUDE_DIRECTORY} ${file_i}) + list(APPEND files ${file_i}) + endforeach() + set( outfiles ) + foreach(file_i ${files}) + get_filename_component(outfile_name ${file_i} NAME) + get_filename_component(outfile_name_we ${file_i} NAME_WE) + get_filename_component(outfile_ext ${file_i} EXT) + get_filename_component(outfile_dir ${file_i} DIRECTORY) + if (${file_i} IN_LIST ectrans_common_includes) + configure_file(${_PAR_INCLUDE_DIRECTORY}/${file_i} ${destination}/${outfile_name}) + else() + set(outfile "${destination}/${outfile_name_we}_${backend}${outfile_ext}") + ecbuild_debug("Generate ${outfile}") + generate_file(BACKEND ${backend} INPUT ${_PAR_INCLUDE_DIRECTORY}/${file_i} OUTPUT ${outfile}) + list(APPEND outfiles ${outfile}) + string(TOUPPER ${outfile_name_we} OUTFILE_NAME_WE ) + ecbuild_debug("Generate ${destination}/trans_${backend}/${outfile_name}") + file(WRITE ${destination}/trans_${backend}/${outfile_name} "! Automatically generated interface header for backward compatibility of generic symbols !\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#if defined(${outfile_name_we})\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#undef ${outfile_name_we}\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#endif\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#if defined(${OUTFILE_NAME_WE})\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#undef ${OUTFILE_NAME_WE}\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#endif\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#include \"${outfile_name_we}_${backend}${outfile_ext}\"\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#define ${outfile_name_we} ${OUTFILE_NAME_WE}_${backend}\n") + file(APPEND ${destination}/trans_${backend}/${outfile_name} "#define ${OUTFILE_NAME_WE} ${OUTFILE_NAME_WE}_${backend}\n") + endif() + endforeach(file_i) + + add_custom_target(${_PAR_TARGET}_generate DEPENDS ${outfiles}) + ecbuild_add_library(TARGET ${_PAR_TARGET} TYPE INTERFACE) + add_dependencies(${_PAR_TARGET} ${_PAR_TARGET}_generate) + target_include_directories(${_PAR_TARGET} INTERFACE $) +endfunction(generate_backend_includes) + + + +function(generate_backend_sources) + set (options) + set (oneValueArgs BACKEND DESTINATION OUTPUT) + set (multiValueArgs) + + cmake_parse_arguments(_PAR "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(backend ${_PAR_BACKEND}) + set(destination ${_PAR_DESTINATION}) + file(MAKE_DIRECTORY ${destination}/algor) + file(MAKE_DIRECTORY ${destination}/internal) + file(MAKE_DIRECTORY ${destination}/external) + + ecbuild_list_add_pattern( LIST files + GLOB + algor/* + internal/* + external/* + QUIET + ) + + set(outfiles) + foreach(file_i ${files}) + if(NOT (${file_i} IN_LIST ectrans_common_src)) + get_filename_component(outfile_name ${file_i} NAME) + get_filename_component(outfile_name_we ${file_i} NAME_WE) + get_filename_component(outfile_ext ${file_i} EXT) + get_filename_component(outfile_dir ${file_i} DIRECTORY) + set(outfile "${destination}/${file_i}") + ecbuild_debug("Generate ${outfile}") + generate_file(BACKEND ${backend} INPUT ${CMAKE_CURRENT_SOURCE_DIR}/${file_i} OUTPUT ${outfile}) + list(APPEND outfiles ${outfile}) + endif() + endforeach(file_i) + set(${_PAR_OUTPUT} ${outfiles} PARENT_SCOPE) +endfunction(generate_backend_sources) + +set( BUILD_INTERFACE_INCLUDE_DIR ${CMAKE_BINARY_DIR}/include/ectrans ) + +foreach( prec dp sp ) + if( HAVE_${prec} ) + + generate_backend_includes(BACKEND ${prec} TARGET ectrans_${prec}_includes DESTINATION ${BUILD_INTERFACE_INCLUDE_DIR} INCLUDE_DIRECTORY ${PROJECT_SOURCE_DIR}/src/trans/include ) + generate_backend_sources( BACKEND ${prec} OUTPUT ectrans_${prec}_src DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/generated/ectrans_${prec}) + ecbuild_add_library( + TARGET ectrans_${prec} + LINKER_LANGUAGE Fortran + SOURCES ${ectrans_${prec}_src} + PUBLIC_INCLUDES $ + $ + $ + PUBLIC_LIBS ectrans_common ectrans_${prec}_includes + ) + + ectrans_target_fortran_module_directory( + TARGET ectrans_${prec} + MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/module/ectrans_${prec} + INSTALL_DIRECTORY module/ectrans_${prec} + ) + target_link_libraries( ectrans_${prec} PUBLIC fiat) + + set( FFTW_LINK PRIVATE ) + if( LAPACK_LIBRARIES MATCHES "mkl" AND NOT FFTW_LIBRARIES MATCHES "mkl" ) + ecbuild_warn( "Danger: Both MKL and FFTW are linked in trans_${prec}. " + "No guarantees on link order can be made for the final executable.") + set( FFTW_LINK PUBLIC ) # Attempt anyway to give FFTW precedence + endif() + ecbuild_debug("target_link_libraries( trans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} )") + target_link_libraries( ectrans_${prec} ${FFTW_LINK} ${FFTW_LIBRARIES} ) + target_include_directories( ectrans_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) + target_compile_definitions( ectrans_${prec} PRIVATE WITH_FFTW ) + ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE ${LAPACK_LIBRARIES} )") + target_link_libraries( ectrans_${prec} PRIVATE ${LAPACK_LIBRARIES} ) + + if( HAVE_OMP ) + ecbuild_debug("target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran )") + target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran ) + endif() + + # This interface library is for backward compatibility, and provides the older includes + ecbuild_add_library( TARGET trans_${prec} TYPE INTERFACE ) + target_include_directories( trans_${prec} INTERFACE $ ) + target_include_directories( trans_${prec} INTERFACE $ ) + target_link_libraries( trans_${prec} INTERFACE fiat ectrans_${prec} parkind_${prec}) + endif() +endforeach() + +## Install trans interface +install( DIRECTORY ${BUILD_INTERFACE_INCLUDE_DIR}/ DESTINATION include/ectrans ) diff --git a/src/trans/algor/butterfly_alg_mod.F90 b/src/trans/cpu/algor/butterfly_alg_mod.F90 similarity index 100% rename from src/trans/algor/butterfly_alg_mod.F90 rename to src/trans/cpu/algor/butterfly_alg_mod.F90 diff --git a/src/trans/algor/ectrans_blas_mod.F90 b/src/trans/cpu/algor/ectrans_blas_mod.F90 similarity index 100% rename from src/trans/algor/ectrans_blas_mod.F90 rename to src/trans/cpu/algor/ectrans_blas_mod.F90 diff --git a/src/trans/algor/interpol_decomp_mod.F90 b/src/trans/cpu/algor/interpol_decomp_mod.F90 similarity index 100% rename from src/trans/algor/interpol_decomp_mod.F90 rename to src/trans/cpu/algor/interpol_decomp_mod.F90 diff --git a/src/trans/algor/seefmm_mix.F90 b/src/trans/cpu/algor/seefmm_mix.F90 similarity index 100% rename from src/trans/algor/seefmm_mix.F90 rename to src/trans/cpu/algor/seefmm_mix.F90 diff --git a/src/trans/algor/wts500_mod.F90 b/src/trans/cpu/algor/wts500_mod.F90 similarity index 100% rename from src/trans/algor/wts500_mod.F90 rename to src/trans/cpu/algor/wts500_mod.F90 diff --git a/src/trans/external/dir_trans.F90 b/src/trans/cpu/external/dir_trans.F90 similarity index 100% rename from src/trans/external/dir_trans.F90 rename to src/trans/cpu/external/dir_trans.F90 diff --git a/src/trans/external/dir_transad.F90 b/src/trans/cpu/external/dir_transad.F90 similarity index 100% rename from src/trans/external/dir_transad.F90 rename to src/trans/cpu/external/dir_transad.F90 diff --git a/src/trans/external/dist_grid.F90 b/src/trans/cpu/external/dist_grid.F90 similarity index 100% rename from src/trans/external/dist_grid.F90 rename to src/trans/cpu/external/dist_grid.F90 diff --git a/src/trans/external/dist_grid_32.F90 b/src/trans/cpu/external/dist_grid_32.F90 similarity index 100% rename from src/trans/external/dist_grid_32.F90 rename to src/trans/cpu/external/dist_grid_32.F90 diff --git a/src/trans/external/dist_spec.F90 b/src/trans/cpu/external/dist_spec.F90 similarity index 100% rename from src/trans/external/dist_spec.F90 rename to src/trans/cpu/external/dist_spec.F90 diff --git a/src/trans/external/gath_grid.F90 b/src/trans/cpu/external/gath_grid.F90 similarity index 100% rename from src/trans/external/gath_grid.F90 rename to src/trans/cpu/external/gath_grid.F90 diff --git a/src/trans/external/gath_grid_32.F90 b/src/trans/cpu/external/gath_grid_32.F90 similarity index 100% rename from src/trans/external/gath_grid_32.F90 rename to src/trans/cpu/external/gath_grid_32.F90 diff --git a/src/trans/external/gath_spec.F90 b/src/trans/cpu/external/gath_spec.F90 similarity index 100% rename from src/trans/external/gath_spec.F90 rename to src/trans/cpu/external/gath_spec.F90 diff --git a/src/trans/external/get_current.F90 b/src/trans/cpu/external/get_current.F90 similarity index 100% rename from src/trans/external/get_current.F90 rename to src/trans/cpu/external/get_current.F90 diff --git a/src/trans/external/gpnorm_trans.F90 b/src/trans/cpu/external/gpnorm_trans.F90 similarity index 100% rename from src/trans/external/gpnorm_trans.F90 rename to src/trans/cpu/external/gpnorm_trans.F90 diff --git a/src/trans/external/ini_spec_dist.F90 b/src/trans/cpu/external/ini_spec_dist.F90 similarity index 100% rename from src/trans/external/ini_spec_dist.F90 rename to src/trans/cpu/external/ini_spec_dist.F90 diff --git a/src/trans/external/inv_trans.F90 b/src/trans/cpu/external/inv_trans.F90 similarity index 100% rename from src/trans/external/inv_trans.F90 rename to src/trans/cpu/external/inv_trans.F90 diff --git a/src/trans/external/inv_transad.F90 b/src/trans/cpu/external/inv_transad.F90 similarity index 100% rename from src/trans/external/inv_transad.F90 rename to src/trans/cpu/external/inv_transad.F90 diff --git a/src/trans/external/setup_trans.F90 b/src/trans/cpu/external/setup_trans.F90 similarity index 100% rename from src/trans/external/setup_trans.F90 rename to src/trans/cpu/external/setup_trans.F90 diff --git a/src/trans/external/setup_trans0.F90 b/src/trans/cpu/external/setup_trans0.F90 similarity index 99% rename from src/trans/external/setup_trans0.F90 rename to src/trans/cpu/external/setup_trans0.F90 index cdd581f1..1861b2dc 100644 --- a/src/trans/external/setup_trans0.F90 +++ b/src/trans/cpu/external/setup_trans0.F90 @@ -145,6 +145,7 @@ SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& WRITE(NOUT,'(A)') WRITE(NOUT,'(A)') "ecTrans at version: " // ECTRANS_VERSION_STR() WRITE(NOUT,'(A)') "commit: " // ECTRANS_GIT_SHA1() +WRITE(NOUT,'(A)') "CPU version" WRITE(NOUT,'(A)') LLP1 = NPRINTLEV>0 diff --git a/src/trans/external/specnorm.F90 b/src/trans/cpu/external/specnorm.F90 similarity index 100% rename from src/trans/external/specnorm.F90 rename to src/trans/cpu/external/specnorm.F90 diff --git a/src/trans/external/trans_end.F90 b/src/trans/cpu/external/trans_end.F90 similarity index 100% rename from src/trans/external/trans_end.F90 rename to src/trans/cpu/external/trans_end.F90 diff --git a/src/trans/external/trans_inq.F90 b/src/trans/cpu/external/trans_inq.F90 similarity index 100% rename from src/trans/external/trans_inq.F90 rename to src/trans/cpu/external/trans_inq.F90 diff --git a/src/trans/external/trans_pnm.F90 b/src/trans/cpu/external/trans_pnm.F90 similarity index 100% rename from src/trans/external/trans_pnm.F90 rename to src/trans/cpu/external/trans_pnm.F90 diff --git a/src/trans/external/trans_release.F90 b/src/trans/cpu/external/trans_release.F90 similarity index 100% rename from src/trans/external/trans_release.F90 rename to src/trans/cpu/external/trans_release.F90 diff --git a/src/trans/external/vordiv_to_uv.F90 b/src/trans/cpu/external/vordiv_to_uv.F90 similarity index 100% rename from src/trans/external/vordiv_to_uv.F90 rename to src/trans/cpu/external/vordiv_to_uv.F90 diff --git a/src/trans/internal/abort_trans_mod.F90 b/src/trans/cpu/internal/abort_trans_mod.F90 similarity index 100% rename from src/trans/internal/abort_trans_mod.F90 rename to src/trans/cpu/internal/abort_trans_mod.F90 diff --git a/src/trans/internal/asre1_mod.F90 b/src/trans/cpu/internal/asre1_mod.F90 similarity index 100% rename from src/trans/internal/asre1_mod.F90 rename to src/trans/cpu/internal/asre1_mod.F90 diff --git a/src/trans/internal/asre1ad_mod.F90 b/src/trans/cpu/internal/asre1ad_mod.F90 similarity index 100% rename from src/trans/internal/asre1ad_mod.F90 rename to src/trans/cpu/internal/asre1ad_mod.F90 diff --git a/src/trans/internal/asre1b_mod.F90 b/src/trans/cpu/internal/asre1b_mod.F90 similarity index 100% rename from src/trans/internal/asre1b_mod.F90 rename to src/trans/cpu/internal/asre1b_mod.F90 diff --git a/src/trans/internal/asre1bad_mod.F90 b/src/trans/cpu/internal/asre1bad_mod.F90 similarity index 100% rename from src/trans/internal/asre1bad_mod.F90 rename to src/trans/cpu/internal/asre1bad_mod.F90 diff --git a/src/trans/internal/cdmap_mod.F90 b/src/trans/cpu/internal/cdmap_mod.F90 similarity index 100% rename from src/trans/internal/cdmap_mod.F90 rename to src/trans/cpu/internal/cdmap_mod.F90 diff --git a/src/trans/internal/cpledn_mod.F90 b/src/trans/cpu/internal/cpledn_mod.F90 similarity index 100% rename from src/trans/internal/cpledn_mod.F90 rename to src/trans/cpu/internal/cpledn_mod.F90 diff --git a/src/trans/internal/dealloc_resol_mod.F90 b/src/trans/cpu/internal/dealloc_resol_mod.F90 similarity index 100% rename from src/trans/internal/dealloc_resol_mod.F90 rename to src/trans/cpu/internal/dealloc_resol_mod.F90 diff --git a/src/trans/internal/dir_trans_ctl_mod.F90 b/src/trans/cpu/internal/dir_trans_ctl_mod.F90 similarity index 100% rename from src/trans/internal/dir_trans_ctl_mod.F90 rename to src/trans/cpu/internal/dir_trans_ctl_mod.F90 diff --git a/src/trans/internal/dir_trans_ctlad_mod.F90 b/src/trans/cpu/internal/dir_trans_ctlad_mod.F90 similarity index 100% rename from src/trans/internal/dir_trans_ctlad_mod.F90 rename to src/trans/cpu/internal/dir_trans_ctlad_mod.F90 diff --git a/src/trans/internal/dist_grid_32_ctl_mod.F90 b/src/trans/cpu/internal/dist_grid_32_ctl_mod.F90 similarity index 100% rename from src/trans/internal/dist_grid_32_ctl_mod.F90 rename to src/trans/cpu/internal/dist_grid_32_ctl_mod.F90 diff --git a/src/trans/internal/dist_grid_ctl_mod.F90 b/src/trans/cpu/internal/dist_grid_ctl_mod.F90 similarity index 100% rename from src/trans/internal/dist_grid_ctl_mod.F90 rename to src/trans/cpu/internal/dist_grid_ctl_mod.F90 diff --git a/src/trans/internal/dist_spec_control_mod.F90 b/src/trans/cpu/internal/dist_spec_control_mod.F90 similarity index 100% rename from src/trans/internal/dist_spec_control_mod.F90 rename to src/trans/cpu/internal/dist_spec_control_mod.F90 diff --git a/src/trans/internal/ectrans_version_mod.F90.in b/src/trans/cpu/internal/ectrans_version_mod.F90.in similarity index 100% rename from src/trans/internal/ectrans_version_mod.F90.in rename to src/trans/cpu/internal/ectrans_version_mod.F90.in diff --git a/src/trans/internal/eq_regions_mod.F90 b/src/trans/cpu/internal/eq_regions_mod.F90 similarity index 100% rename from src/trans/internal/eq_regions_mod.F90 rename to src/trans/cpu/internal/eq_regions_mod.F90 diff --git a/src/trans/internal/field_split_mod.F90 b/src/trans/cpu/internal/field_split_mod.F90 similarity index 100% rename from src/trans/internal/field_split_mod.F90 rename to src/trans/cpu/internal/field_split_mod.F90 diff --git a/src/trans/internal/fourier_in_mod.F90 b/src/trans/cpu/internal/fourier_in_mod.F90 similarity index 100% rename from src/trans/internal/fourier_in_mod.F90 rename to src/trans/cpu/internal/fourier_in_mod.F90 diff --git a/src/trans/internal/fourier_inad_mod.F90 b/src/trans/cpu/internal/fourier_inad_mod.F90 similarity index 100% rename from src/trans/internal/fourier_inad_mod.F90 rename to src/trans/cpu/internal/fourier_inad_mod.F90 diff --git a/src/trans/internal/fourier_out_mod.F90 b/src/trans/cpu/internal/fourier_out_mod.F90 similarity index 100% rename from src/trans/internal/fourier_out_mod.F90 rename to src/trans/cpu/internal/fourier_out_mod.F90 diff --git a/src/trans/internal/fourier_outad_mod.F90 b/src/trans/cpu/internal/fourier_outad_mod.F90 similarity index 100% rename from src/trans/internal/fourier_outad_mod.F90 rename to src/trans/cpu/internal/fourier_outad_mod.F90 diff --git a/src/trans/internal/fsc_mod.F90 b/src/trans/cpu/internal/fsc_mod.F90 similarity index 100% rename from src/trans/internal/fsc_mod.F90 rename to src/trans/cpu/internal/fsc_mod.F90 diff --git a/src/trans/internal/fscad_mod.F90 b/src/trans/cpu/internal/fscad_mod.F90 similarity index 100% rename from src/trans/internal/fscad_mod.F90 rename to src/trans/cpu/internal/fscad_mod.F90 diff --git a/src/trans/internal/fspgl_int_mod.F90 b/src/trans/cpu/internal/fspgl_int_mod.F90 similarity index 100% rename from src/trans/internal/fspgl_int_mod.F90 rename to src/trans/cpu/internal/fspgl_int_mod.F90 diff --git a/src/trans/internal/ftdir_ctl_mod.F90 b/src/trans/cpu/internal/ftdir_ctl_mod.F90 similarity index 100% rename from src/trans/internal/ftdir_ctl_mod.F90 rename to src/trans/cpu/internal/ftdir_ctl_mod.F90 diff --git a/src/trans/internal/ftdir_ctlad_mod.F90 b/src/trans/cpu/internal/ftdir_ctlad_mod.F90 similarity index 100% rename from src/trans/internal/ftdir_ctlad_mod.F90 rename to src/trans/cpu/internal/ftdir_ctlad_mod.F90 diff --git a/src/trans/internal/ftdir_mod.F90 b/src/trans/cpu/internal/ftdir_mod.F90 similarity index 100% rename from src/trans/internal/ftdir_mod.F90 rename to src/trans/cpu/internal/ftdir_mod.F90 diff --git a/src/trans/internal/ftdirad_mod.F90 b/src/trans/cpu/internal/ftdirad_mod.F90 similarity index 100% rename from src/trans/internal/ftdirad_mod.F90 rename to src/trans/cpu/internal/ftdirad_mod.F90 diff --git a/src/trans/internal/ftinv_ctl_mod.F90 b/src/trans/cpu/internal/ftinv_ctl_mod.F90 similarity index 100% rename from src/trans/internal/ftinv_ctl_mod.F90 rename to src/trans/cpu/internal/ftinv_ctl_mod.F90 diff --git a/src/trans/internal/ftinv_ctlad_mod.F90 b/src/trans/cpu/internal/ftinv_ctlad_mod.F90 similarity index 100% rename from src/trans/internal/ftinv_ctlad_mod.F90 rename to src/trans/cpu/internal/ftinv_ctlad_mod.F90 diff --git a/src/trans/internal/ftinv_mod.F90 b/src/trans/cpu/internal/ftinv_mod.F90 similarity index 100% rename from src/trans/internal/ftinv_mod.F90 rename to src/trans/cpu/internal/ftinv_mod.F90 diff --git a/src/trans/internal/ftinvad_mod.F90 b/src/trans/cpu/internal/ftinvad_mod.F90 similarity index 100% rename from src/trans/internal/ftinvad_mod.F90 rename to src/trans/cpu/internal/ftinvad_mod.F90 diff --git a/src/trans/internal/gath_grid_32_ctl_mod.F90 b/src/trans/cpu/internal/gath_grid_32_ctl_mod.F90 similarity index 100% rename from src/trans/internal/gath_grid_32_ctl_mod.F90 rename to src/trans/cpu/internal/gath_grid_32_ctl_mod.F90 diff --git a/src/trans/internal/gath_grid_ctl_mod.F90 b/src/trans/cpu/internal/gath_grid_ctl_mod.F90 similarity index 100% rename from src/trans/internal/gath_grid_ctl_mod.F90 rename to src/trans/cpu/internal/gath_grid_ctl_mod.F90 diff --git a/src/trans/internal/gath_spec_control_mod.F90 b/src/trans/cpu/internal/gath_spec_control_mod.F90 similarity index 100% rename from src/trans/internal/gath_spec_control_mod.F90 rename to src/trans/cpu/internal/gath_spec_control_mod.F90 diff --git a/src/trans/internal/gawl_mod.F90 b/src/trans/cpu/internal/gawl_mod.F90 similarity index 100% rename from src/trans/internal/gawl_mod.F90 rename to src/trans/cpu/internal/gawl_mod.F90 diff --git a/src/trans/internal/gpnorm_trans_ctl_mod.F90 b/src/trans/cpu/internal/gpnorm_trans_ctl_mod.F90 similarity index 100% rename from src/trans/internal/gpnorm_trans_ctl_mod.F90 rename to src/trans/cpu/internal/gpnorm_trans_ctl_mod.F90 diff --git a/src/trans/internal/inigptr_mod.F90 b/src/trans/cpu/internal/inigptr_mod.F90 similarity index 100% rename from src/trans/internal/inigptr_mod.F90 rename to src/trans/cpu/internal/inigptr_mod.F90 diff --git a/src/trans/internal/inv_trans_ctl_mod.F90 b/src/trans/cpu/internal/inv_trans_ctl_mod.F90 similarity index 100% rename from src/trans/internal/inv_trans_ctl_mod.F90 rename to src/trans/cpu/internal/inv_trans_ctl_mod.F90 diff --git a/src/trans/internal/inv_trans_ctlad_mod.F90 b/src/trans/cpu/internal/inv_trans_ctlad_mod.F90 similarity index 100% rename from src/trans/internal/inv_trans_ctlad_mod.F90 rename to src/trans/cpu/internal/inv_trans_ctlad_mod.F90 diff --git a/src/trans/internal/ldfou2_mod.F90 b/src/trans/cpu/internal/ldfou2_mod.F90 similarity index 100% rename from src/trans/internal/ldfou2_mod.F90 rename to src/trans/cpu/internal/ldfou2_mod.F90 diff --git a/src/trans/internal/ldfou2ad_mod.F90 b/src/trans/cpu/internal/ldfou2ad_mod.F90 similarity index 100% rename from src/trans/internal/ldfou2ad_mod.F90 rename to src/trans/cpu/internal/ldfou2ad_mod.F90 diff --git a/src/trans/internal/ledir_mod.F90 b/src/trans/cpu/internal/ledir_mod.F90 similarity index 100% rename from src/trans/internal/ledir_mod.F90 rename to src/trans/cpu/internal/ledir_mod.F90 diff --git a/src/trans/internal/ledirad_mod.F90 b/src/trans/cpu/internal/ledirad_mod.F90 similarity index 100% rename from src/trans/internal/ledirad_mod.F90 rename to src/trans/cpu/internal/ledirad_mod.F90 diff --git a/src/trans/internal/leinv_mod.F90 b/src/trans/cpu/internal/leinv_mod.F90 similarity index 100% rename from src/trans/internal/leinv_mod.F90 rename to src/trans/cpu/internal/leinv_mod.F90 diff --git a/src/trans/internal/leinvad_mod.F90 b/src/trans/cpu/internal/leinvad_mod.F90 similarity index 100% rename from src/trans/internal/leinvad_mod.F90 rename to src/trans/cpu/internal/leinvad_mod.F90 diff --git a/src/trans/internal/ltdir_ctl_mod.F90 b/src/trans/cpu/internal/ltdir_ctl_mod.F90 similarity index 100% rename from src/trans/internal/ltdir_ctl_mod.F90 rename to src/trans/cpu/internal/ltdir_ctl_mod.F90 diff --git a/src/trans/internal/ltdir_ctlad_mod.F90 b/src/trans/cpu/internal/ltdir_ctlad_mod.F90 similarity index 100% rename from src/trans/internal/ltdir_ctlad_mod.F90 rename to src/trans/cpu/internal/ltdir_ctlad_mod.F90 diff --git a/src/trans/internal/ltdir_mod.F90 b/src/trans/cpu/internal/ltdir_mod.F90 similarity index 100% rename from src/trans/internal/ltdir_mod.F90 rename to src/trans/cpu/internal/ltdir_mod.F90 diff --git a/src/trans/internal/ltdirad_mod.F90 b/src/trans/cpu/internal/ltdirad_mod.F90 similarity index 100% rename from src/trans/internal/ltdirad_mod.F90 rename to src/trans/cpu/internal/ltdirad_mod.F90 diff --git a/src/trans/internal/ltinv_ctl_mod.F90 b/src/trans/cpu/internal/ltinv_ctl_mod.F90 similarity index 100% rename from src/trans/internal/ltinv_ctl_mod.F90 rename to src/trans/cpu/internal/ltinv_ctl_mod.F90 diff --git a/src/trans/internal/ltinv_ctlad_mod.F90 b/src/trans/cpu/internal/ltinv_ctlad_mod.F90 similarity index 100% rename from src/trans/internal/ltinv_ctlad_mod.F90 rename to src/trans/cpu/internal/ltinv_ctlad_mod.F90 diff --git a/src/trans/internal/ltinv_mod.F90 b/src/trans/cpu/internal/ltinv_mod.F90 similarity index 100% rename from src/trans/internal/ltinv_mod.F90 rename to src/trans/cpu/internal/ltinv_mod.F90 diff --git a/src/trans/internal/ltinvad_mod.F90 b/src/trans/cpu/internal/ltinvad_mod.F90 similarity index 100% rename from src/trans/internal/ltinvad_mod.F90 rename to src/trans/cpu/internal/ltinvad_mod.F90 diff --git a/src/trans/internal/myrecvset_mod.F90 b/src/trans/cpu/internal/myrecvset_mod.F90 similarity index 100% rename from src/trans/internal/myrecvset_mod.F90 rename to src/trans/cpu/internal/myrecvset_mod.F90 diff --git a/src/trans/internal/mysendset_mod.F90 b/src/trans/cpu/internal/mysendset_mod.F90 similarity index 100% rename from src/trans/internal/mysendset_mod.F90 rename to src/trans/cpu/internal/mysendset_mod.F90 diff --git a/src/trans/internal/pe2set_mod.F90 b/src/trans/cpu/internal/pe2set_mod.F90 similarity index 100% rename from src/trans/internal/pe2set_mod.F90 rename to src/trans/cpu/internal/pe2set_mod.F90 diff --git a/src/trans/internal/pre_suleg_mod.F90 b/src/trans/cpu/internal/pre_suleg_mod.F90 similarity index 100% rename from src/trans/internal/pre_suleg_mod.F90 rename to src/trans/cpu/internal/pre_suleg_mod.F90 diff --git a/src/trans/internal/prepsnm_mod.F90 b/src/trans/cpu/internal/prepsnm_mod.F90 similarity index 100% rename from src/trans/internal/prepsnm_mod.F90 rename to src/trans/cpu/internal/prepsnm_mod.F90 diff --git a/src/trans/internal/prfi1_mod.F90 b/src/trans/cpu/internal/prfi1_mod.F90 similarity index 100% rename from src/trans/internal/prfi1_mod.F90 rename to src/trans/cpu/internal/prfi1_mod.F90 diff --git a/src/trans/internal/prfi1ad_mod.F90 b/src/trans/cpu/internal/prfi1ad_mod.F90 similarity index 100% rename from src/trans/internal/prfi1ad_mod.F90 rename to src/trans/cpu/internal/prfi1ad_mod.F90 diff --git a/src/trans/internal/prfi1b_mod.F90 b/src/trans/cpu/internal/prfi1b_mod.F90 similarity index 100% rename from src/trans/internal/prfi1b_mod.F90 rename to src/trans/cpu/internal/prfi1b_mod.F90 diff --git a/src/trans/internal/prfi1bad_mod.F90 b/src/trans/cpu/internal/prfi1bad_mod.F90 similarity index 100% rename from src/trans/internal/prfi1bad_mod.F90 rename to src/trans/cpu/internal/prfi1bad_mod.F90 diff --git a/src/trans/internal/prfi2_mod.F90 b/src/trans/cpu/internal/prfi2_mod.F90 similarity index 100% rename from src/trans/internal/prfi2_mod.F90 rename to src/trans/cpu/internal/prfi2_mod.F90 diff --git a/src/trans/internal/prfi2ad_mod.F90 b/src/trans/cpu/internal/prfi2ad_mod.F90 similarity index 100% rename from src/trans/internal/prfi2ad_mod.F90 rename to src/trans/cpu/internal/prfi2ad_mod.F90 diff --git a/src/trans/internal/prfi2b_mod.F90 b/src/trans/cpu/internal/prfi2b_mod.F90 similarity index 100% rename from src/trans/internal/prfi2b_mod.F90 rename to src/trans/cpu/internal/prfi2b_mod.F90 diff --git a/src/trans/internal/prfi2bad_mod.F90 b/src/trans/cpu/internal/prfi2bad_mod.F90 similarity index 100% rename from src/trans/internal/prfi2bad_mod.F90 rename to src/trans/cpu/internal/prfi2bad_mod.F90 diff --git a/src/trans/internal/read_legpol_mod.F90 b/src/trans/cpu/internal/read_legpol_mod.F90 similarity index 100% rename from src/trans/internal/read_legpol_mod.F90 rename to src/trans/cpu/internal/read_legpol_mod.F90 diff --git a/src/trans/internal/set2pe_mod.F90 b/src/trans/cpu/internal/set2pe_mod.F90 similarity index 100% rename from src/trans/internal/set2pe_mod.F90 rename to src/trans/cpu/internal/set2pe_mod.F90 diff --git a/src/trans/internal/set_resol_mod.F90 b/src/trans/cpu/internal/set_resol_mod.F90 similarity index 100% rename from src/trans/internal/set_resol_mod.F90 rename to src/trans/cpu/internal/set_resol_mod.F90 diff --git a/src/trans/internal/setup_dims_mod.F90 b/src/trans/cpu/internal/setup_dims_mod.F90 similarity index 100% rename from src/trans/internal/setup_dims_mod.F90 rename to src/trans/cpu/internal/setup_dims_mod.F90 diff --git a/src/trans/internal/setup_geom_mod.F90 b/src/trans/cpu/internal/setup_geom_mod.F90 similarity index 100% rename from src/trans/internal/setup_geom_mod.F90 rename to src/trans/cpu/internal/setup_geom_mod.F90 diff --git a/src/trans/internal/shuffle_mod.F90 b/src/trans/cpu/internal/shuffle_mod.F90 similarity index 100% rename from src/trans/internal/shuffle_mod.F90 rename to src/trans/cpu/internal/shuffle_mod.F90 diff --git a/src/trans/internal/spnorm_ctl_mod.F90 b/src/trans/cpu/internal/spnorm_ctl_mod.F90 similarity index 100% rename from src/trans/internal/spnorm_ctl_mod.F90 rename to src/trans/cpu/internal/spnorm_ctl_mod.F90 diff --git a/src/trans/internal/spnormc_mod.F90 b/src/trans/cpu/internal/spnormc_mod.F90 similarity index 100% rename from src/trans/internal/spnormc_mod.F90 rename to src/trans/cpu/internal/spnormc_mod.F90 diff --git a/src/trans/internal/spnormd_mod.F90 b/src/trans/cpu/internal/spnormd_mod.F90 similarity index 100% rename from src/trans/internal/spnormd_mod.F90 rename to src/trans/cpu/internal/spnormd_mod.F90 diff --git a/src/trans/internal/spnsde_mod.F90 b/src/trans/cpu/internal/spnsde_mod.F90 similarity index 100% rename from src/trans/internal/spnsde_mod.F90 rename to src/trans/cpu/internal/spnsde_mod.F90 diff --git a/src/trans/internal/spnsdead_mod.F90 b/src/trans/cpu/internal/spnsdead_mod.F90 similarity index 100% rename from src/trans/internal/spnsdead_mod.F90 rename to src/trans/cpu/internal/spnsdead_mod.F90 diff --git a/src/trans/internal/sugaw_mod.F90 b/src/trans/cpu/internal/sugaw_mod.F90 similarity index 100% rename from src/trans/internal/sugaw_mod.F90 rename to src/trans/cpu/internal/sugaw_mod.F90 diff --git a/src/trans/internal/suleg_mod.F90 b/src/trans/cpu/internal/suleg_mod.F90 similarity index 100% rename from src/trans/internal/suleg_mod.F90 rename to src/trans/cpu/internal/suleg_mod.F90 diff --git a/src/trans/internal/sump_trans0_mod.F90 b/src/trans/cpu/internal/sump_trans0_mod.F90 similarity index 100% rename from src/trans/internal/sump_trans0_mod.F90 rename to src/trans/cpu/internal/sump_trans0_mod.F90 diff --git a/src/trans/internal/sump_trans_mod.F90 b/src/trans/cpu/internal/sump_trans_mod.F90 similarity index 100% rename from src/trans/internal/sump_trans_mod.F90 rename to src/trans/cpu/internal/sump_trans_mod.F90 diff --git a/src/trans/internal/sump_trans_preleg_mod.F90 b/src/trans/cpu/internal/sump_trans_preleg_mod.F90 similarity index 100% rename from src/trans/internal/sump_trans_preleg_mod.F90 rename to src/trans/cpu/internal/sump_trans_preleg_mod.F90 diff --git a/src/trans/internal/sumplat_mod.F90 b/src/trans/cpu/internal/sumplat_mod.F90 similarity index 100% rename from src/trans/internal/sumplat_mod.F90 rename to src/trans/cpu/internal/sumplat_mod.F90 diff --git a/src/trans/internal/sumplatb_mod.F90 b/src/trans/cpu/internal/sumplatb_mod.F90 similarity index 100% rename from src/trans/internal/sumplatb_mod.F90 rename to src/trans/cpu/internal/sumplatb_mod.F90 diff --git a/src/trans/internal/sumplatbeq_mod.F90 b/src/trans/cpu/internal/sumplatbeq_mod.F90 similarity index 100% rename from src/trans/internal/sumplatbeq_mod.F90 rename to src/trans/cpu/internal/sumplatbeq_mod.F90 diff --git a/src/trans/internal/sumplatf_mod.F90 b/src/trans/cpu/internal/sumplatf_mod.F90 similarity index 100% rename from src/trans/internal/sumplatf_mod.F90 rename to src/trans/cpu/internal/sumplatf_mod.F90 diff --git a/src/trans/internal/supol_mod.F90 b/src/trans/cpu/internal/supol_mod.F90 similarity index 100% rename from src/trans/internal/supol_mod.F90 rename to src/trans/cpu/internal/supol_mod.F90 diff --git a/src/trans/internal/supolf_mod.F90 b/src/trans/cpu/internal/supolf_mod.F90 similarity index 100% rename from src/trans/internal/supolf_mod.F90 rename to src/trans/cpu/internal/supolf_mod.F90 diff --git a/src/trans/internal/sustaonl_mod.F90 b/src/trans/cpu/internal/sustaonl_mod.F90 similarity index 100% rename from src/trans/internal/sustaonl_mod.F90 rename to src/trans/cpu/internal/sustaonl_mod.F90 diff --git a/src/trans/internal/sutrle_mod.F90 b/src/trans/cpu/internal/sutrle_mod.F90 similarity index 100% rename from src/trans/internal/sutrle_mod.F90 rename to src/trans/cpu/internal/sutrle_mod.F90 diff --git a/src/trans/internal/suwavedi_mod.F90 b/src/trans/cpu/internal/suwavedi_mod.F90 similarity index 100% rename from src/trans/internal/suwavedi_mod.F90 rename to src/trans/cpu/internal/suwavedi_mod.F90 diff --git a/src/trans/internal/tpm_constants.F90 b/src/trans/cpu/internal/tpm_constants.F90 similarity index 100% rename from src/trans/internal/tpm_constants.F90 rename to src/trans/cpu/internal/tpm_constants.F90 diff --git a/src/trans/internal/tpm_ctl.F90 b/src/trans/cpu/internal/tpm_ctl.F90 similarity index 100% rename from src/trans/internal/tpm_ctl.F90 rename to src/trans/cpu/internal/tpm_ctl.F90 diff --git a/src/trans/internal/tpm_dim.F90 b/src/trans/cpu/internal/tpm_dim.F90 similarity index 100% rename from src/trans/internal/tpm_dim.F90 rename to src/trans/cpu/internal/tpm_dim.F90 diff --git a/src/trans/internal/tpm_distr.F90 b/src/trans/cpu/internal/tpm_distr.F90 similarity index 100% rename from src/trans/internal/tpm_distr.F90 rename to src/trans/cpu/internal/tpm_distr.F90 diff --git a/src/trans/internal/tpm_fftw.F90 b/src/trans/cpu/internal/tpm_fftw.F90 similarity index 100% rename from src/trans/internal/tpm_fftw.F90 rename to src/trans/cpu/internal/tpm_fftw.F90 diff --git a/src/trans/internal/tpm_fields.F90 b/src/trans/cpu/internal/tpm_fields.F90 similarity index 100% rename from src/trans/internal/tpm_fields.F90 rename to src/trans/cpu/internal/tpm_fields.F90 diff --git a/src/trans/internal/tpm_flt.F90 b/src/trans/cpu/internal/tpm_flt.F90 similarity index 100% rename from src/trans/internal/tpm_flt.F90 rename to src/trans/cpu/internal/tpm_flt.F90 diff --git a/src/trans/internal/tpm_gen.F90 b/src/trans/cpu/internal/tpm_gen.F90 similarity index 100% rename from src/trans/internal/tpm_gen.F90 rename to src/trans/cpu/internal/tpm_gen.F90 diff --git a/src/trans/internal/tpm_geometry.F90 b/src/trans/cpu/internal/tpm_geometry.F90 similarity index 100% rename from src/trans/internal/tpm_geometry.F90 rename to src/trans/cpu/internal/tpm_geometry.F90 diff --git a/src/trans/internal/tpm_pol.F90 b/src/trans/cpu/internal/tpm_pol.F90 similarity index 100% rename from src/trans/internal/tpm_pol.F90 rename to src/trans/cpu/internal/tpm_pol.F90 diff --git a/src/trans/internal/tpm_trans.F90 b/src/trans/cpu/internal/tpm_trans.F90 similarity index 100% rename from src/trans/internal/tpm_trans.F90 rename to src/trans/cpu/internal/tpm_trans.F90 diff --git a/src/trans/internal/trgtol_mod.F90 b/src/trans/cpu/internal/trgtol_mod.F90 similarity index 100% rename from src/trans/internal/trgtol_mod.F90 rename to src/trans/cpu/internal/trgtol_mod.F90 diff --git a/src/trans/internal/trltog_mod.F90 b/src/trans/cpu/internal/trltog_mod.F90 similarity index 100% rename from src/trans/internal/trltog_mod.F90 rename to src/trans/cpu/internal/trltog_mod.F90 diff --git a/src/trans/internal/trltom_mod.F90 b/src/trans/cpu/internal/trltom_mod.F90 similarity index 100% rename from src/trans/internal/trltom_mod.F90 rename to src/trans/cpu/internal/trltom_mod.F90 diff --git a/src/trans/internal/trmtol_mod.F90 b/src/trans/cpu/internal/trmtol_mod.F90 similarity index 100% rename from src/trans/internal/trmtol_mod.F90 rename to src/trans/cpu/internal/trmtol_mod.F90 diff --git a/src/trans/internal/updsp_mod.F90 b/src/trans/cpu/internal/updsp_mod.F90 similarity index 100% rename from src/trans/internal/updsp_mod.F90 rename to src/trans/cpu/internal/updsp_mod.F90 diff --git a/src/trans/internal/updspad_mod.F90 b/src/trans/cpu/internal/updspad_mod.F90 similarity index 100% rename from src/trans/internal/updspad_mod.F90 rename to src/trans/cpu/internal/updspad_mod.F90 diff --git a/src/trans/internal/updspb_mod.F90 b/src/trans/cpu/internal/updspb_mod.F90 similarity index 100% rename from src/trans/internal/updspb_mod.F90 rename to src/trans/cpu/internal/updspb_mod.F90 diff --git a/src/trans/internal/updspbad_mod.F90 b/src/trans/cpu/internal/updspbad_mod.F90 similarity index 100% rename from src/trans/internal/updspbad_mod.F90 rename to src/trans/cpu/internal/updspbad_mod.F90 diff --git a/src/trans/internal/uvtvd_mod.F90 b/src/trans/cpu/internal/uvtvd_mod.F90 similarity index 100% rename from src/trans/internal/uvtvd_mod.F90 rename to src/trans/cpu/internal/uvtvd_mod.F90 diff --git a/src/trans/internal/uvtvdad_mod.F90 b/src/trans/cpu/internal/uvtvdad_mod.F90 similarity index 100% rename from src/trans/internal/uvtvdad_mod.F90 rename to src/trans/cpu/internal/uvtvdad_mod.F90 diff --git a/src/trans/internal/vd2uv_ctl_mod.F90 b/src/trans/cpu/internal/vd2uv_ctl_mod.F90 similarity index 100% rename from src/trans/internal/vd2uv_ctl_mod.F90 rename to src/trans/cpu/internal/vd2uv_ctl_mod.F90 diff --git a/src/trans/internal/vd2uv_mod.F90 b/src/trans/cpu/internal/vd2uv_mod.F90 similarity index 100% rename from src/trans/internal/vd2uv_mod.F90 rename to src/trans/cpu/internal/vd2uv_mod.F90 diff --git a/src/trans/internal/vdtuv_mod.F90 b/src/trans/cpu/internal/vdtuv_mod.F90 similarity index 100% rename from src/trans/internal/vdtuv_mod.F90 rename to src/trans/cpu/internal/vdtuv_mod.F90 diff --git a/src/trans/internal/vdtuvad_mod.F90 b/src/trans/cpu/internal/vdtuvad_mod.F90 similarity index 100% rename from src/trans/internal/vdtuvad_mod.F90 rename to src/trans/cpu/internal/vdtuvad_mod.F90 diff --git a/src/trans/internal/write_legpol_mod.F90 b/src/trans/cpu/internal/write_legpol_mod.F90 similarity index 100% rename from src/trans/internal/write_legpol_mod.F90 rename to src/trans/cpu/internal/write_legpol_mod.F90 diff --git a/src/trans/maybe_unused/external/sugawc.F90 b/src/trans/cpu/maybe_unused/external/sugawc.F90 old mode 100644 new mode 100755 similarity index 100% rename from src/trans/maybe_unused/external/sugawc.F90 rename to src/trans/cpu/maybe_unused/external/sugawc.F90 diff --git a/src/trans/maybe_unused/include/ectrans/sugawc.h b/src/trans/cpu/maybe_unused/include/ectrans/sugawc.h similarity index 100% rename from src/trans/maybe_unused/include/ectrans/sugawc.h rename to src/trans/cpu/maybe_unused/include/ectrans/sugawc.h diff --git a/src/trans/sedrenames.txt b/src/trans/cpu/sedrenames.txt similarity index 100% rename from src/trans/sedrenames.txt rename to src/trans/cpu/sedrenames.txt diff --git a/src/trans/sharedmem/sharedmem.c b/src/trans/cpu/sharedmem/sharedmem.c similarity index 100% rename from src/trans/sharedmem/sharedmem.c rename to src/trans/cpu/sharedmem/sharedmem.c diff --git a/src/trans/sharedmem/sharedmem_mod.F90 b/src/trans/cpu/sharedmem/sharedmem_mod.F90 similarity index 100% rename from src/trans/sharedmem/sharedmem_mod.F90 rename to src/trans/cpu/sharedmem/sharedmem_mod.F90 diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt new file mode 100644 index 00000000..c6c0daae --- /dev/null +++ b/src/trans/gpu/CMakeLists.txt @@ -0,0 +1,132 @@ +# (C) Copyright 2020- ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +# Preprocess module file containing version information +configure_file( internal/ectrans_version_mod.F90.in internal/ectrans_version_mod.F90 ) + +## Apply workarounds for some known compilers + +if(CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC") + + # Compile setup_trans with pinned memory to improve data movement performance. + ectrans_add_compile_options( + SOURCES external/setup_trans.F90 + #FLAGS "-gpu=pinned,deepcopy,fastmath,nordc") + FLAGS "-gpu=pinned,fastmath") + # TODO: check if it is sufficient to only set "-gpu=pinned" which appends rather than overwrites + +endif() + +## Assemble sources + +ecbuild_list_add_pattern( LIST trans_src + GLOB + sharedmem/* + algor/* + internal/* + external/* + ${CMAKE_CURRENT_BINARY_DIR}/internal/ectrans_version_mod.F90 + QUIET + ) + +ecbuild_list_exclude_pattern( LIST trans_src REGEX dilatation_mod.F90 ) + +#if( NOT ${CMAKE_BUILD_TYPE_CAPS} STREQUAL DEBUG ) + set_source_files_properties( internal/ftinv_mod.F90 PROPERTIES COMPILE_OPTIONS "-O2" ) + ecbuild_info("warn: special compile flags ftinv_mod.F90") + set_source_files_properties( internal/ftdir_mod.F90 PROPERTIES COMPILE_OPTIONS "-O2" ) + ecbuild_info("warn: special compile flags ftdir_mod.F90") +#endif() + +# Filter source list according to available GPU runtime +if( HAVE_HIP ) + set( GPU_RUNTIME "HIP" ) + ectrans_declare_hip_sources( SOURCES_GLOB + sharedmem/*.hip.cpp + algor/*.hip.cpp + internal/*.hip.cpp + external/*.hip.cpp + ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX \.cu$ ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX cuda_device_mod.F90 ) +elseif( HAVE_CUDA ) + set( GPU_RUNTIME "CUDA" ) + set( ECTRANS_GPU_HIP_LIBRARIES CUDA::cufft CUDA::cublas nvhpcwrapnvtx CUDA::cudart ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX \.hip\.cpp ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX hip_device_mod.F90 ) +else() + ecbuild_info("warn: HIP and CUDA not found") +endif() + +foreach( prec dp sp ) + if( HAVE_${prec} ) + set( GPU_LIBRARY_TYPE SHARED ) + if( HAVE_GPU_STATIC ) + set( GPU_LIBRARY_TYPE STATIC ) + endif() + + ecbuild_add_library( + TARGET trans_gpu_${prec} + TYPE ${GPU_LIBRARY_TYPE} + SOURCES ${trans_src} + LINKER_LANGUAGE Fortran + PUBLIC_INCLUDES $ + $ + $ + $ + $ + PUBLIC_LIBS parkind_${prec} + fiat + PRIVATE_LIBS ${ECTRANS_GPU_HIP_LIBRARIES} + ${LAPACK_LIBRARIES} # we still have symbols in some files + $<${HAVE_ACC}:OpenACC::OpenACC_Fortran> + $<${HAVE_OMP}:OpenMP::OpenMP_Fortran> + $<${HAVE_MPI}:MPI::MPI_Fortran> + $<${HAVE_CUTLASS}:nvidia::cutlass::cutlass> + PRIVATE_DEFINITIONS ${GPU_OFFLOAD}GPU + ${GPU_RUNTIME}GPU + $<${HAVE_GPU_AWARE_MPI}:USE_GPU_AWARE_MPI> + $<${HAVE_CUTLASS}:USE_CUTLASS> + $<${HAVE_CUTLASS_3XTF32}:USE_CUTLASS_3XTF32> + $<${HAVE_GPU_GRAPHS_GEMM}:USE_GRAPHS_GEMM> + ) + + ectrans_target_fortran_module_directory( + TARGET trans_gpu_${prec} + MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/module/trans_gpu_${prec} + INSTALL_DIRECTORY module/trans_gpu_${prec} + ) + + if( prec STREQUAL sp ) + target_compile_definitions( trans_gpu_${prec} PRIVATE TRANS_SINGLE PARKINDTRANS_SINGLE ) + endif() + + if( HAVE_OMP AND CMAKE_Fortran_COMPILER_ID MATCHES Cray ) + # Propagate flags as link options for downstream targets. Only required for Cray + target_link_options( trans_gpu_${prec} INTERFACE + $<$:SHELL:${OpenMP_Fortran_FLAGS}> + $<$:SHELL:${OpenMP_Fortran_FLAGS}> + $<$:SHELL:${OpenMP_Fortran_FLAGS}> ) + endif() + + if( HAVE_ACC AND CMAKE_Fortran_COMPILER_ID MATCHES NVHPC ) + # Propagate flags as link options for downstream targets. Only required for NVHPC + target_link_options( trans_gpu_${prec} INTERFACE + $<$:SHELL:${OpenACC_Fortran_FLAGS} -gpu=pinned> + $<$:SHELL:${OpenACC_Fortran_FLAGS} -gpu=pinned> + $<$:SHELL:${OpenACC_Fortran_FLAGS} -gpu=pinned> ) + endif() + + ## Install trans_gpu_${prec} interface + file( GLOB trans_interface ${PROJECT_SOURCE_DIR}/src/trans/include/ectrans/* ) + install( + FILES ${trans_interface} + DESTINATION include/ectrans/trans_gpu_${prec} + ) + endif() +endforeach() diff --git a/src/trans/gpu/algor/c_hipmemgetinfo.cpp b/src/trans/gpu/algor/c_hipmemgetinfo.cpp new file mode 100644 index 00000000..4e294e5b --- /dev/null +++ b/src/trans/gpu/algor/c_hipmemgetinfo.cpp @@ -0,0 +1,23 @@ +#include "hicblas.h" + + +extern "C" { +hipError_t c_hipmemgetinfo( int *meg_free, int *meg_total) +{ + + size_t l_free = 0; + size_t l_total = 0; + + hipError_t error_memgetinfo; + error_memgetinfo = hipMemGetInfo(&l_free, &l_total); + + long long ll_free = (long long) l_free; + long long ll_total = (long long) l_total; + + *meg_free = (int) (ll_free / 1048576); + *meg_total = (int) (ll_total / 1048576); + + return error_memgetinfo; +} + +} diff --git a/src/trans/gpu/algor/device_mod.F90 b/src/trans/gpu/algor/device_mod.F90 new file mode 100644 index 00000000..ad2c82da --- /dev/null +++ b/src/trans/gpu/algor/device_mod.F90 @@ -0,0 +1,82 @@ +! (C) Copyright 2020- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE DEVICE_MOD + +#ifdef CUDAGPU +#define hipDeviceSynchronize cudaDeviceSynchronize +#define hipStreamSynchronize cudaStreamSynchronize +#define hipStreamDestroy cudaStreamDestroy +#define hipSetDevice cudaSetDevice +#define hipGetDevice cudaGetDevice +#define hipGetDeviceCount cudaGetDeviceCount +#endif + +INTERFACE DEVICE_SYNC + +INTEGER FUNCTION DEVICE_SYNCHRONIZE() BIND(C, NAME='hipDeviceSynchronize') +END FUNCTION DEVICE_SYNCHRONIZE + +END INTERFACE DEVICE_SYNC + +INTERFACE DEVICESTREAMSYNC + +INTEGER FUNCTION DEVICE_STREAM_SYNCHRONIZE(STREAM) BIND(C, NAME='hipStreamSynchronize') +USE ISO_C_BINDING, ONLY: C_PTR +TYPE(C_PTR) :: STREAM +END FUNCTION DEVICE_STREAM_SYNCHRONIZE + +END INTERFACE DEVICESTREAMSYNC + +INTERFACE DEVICESTREAMDESTROY + +INTEGER FUNCTION DEVICE_STREAM_DESTROY(STREAM) BIND(C, NAME='hipStreamDestroy') +USE ISO_C_BINDING, ONLY: C_PTR +TYPE(C_PTR) :: STREAM +END FUNCTION DEVICE_STREAM_DESTROY + +END INTERFACE DEVICESTREAMDESTROY + +INTERFACE DEVICESETDEVICE + +INTEGER FUNCTION DEVICE_SETDEVICE(DEVNUM) BIND(C, NAME='hipSetDevice') +USE ISO_C_BINDING, ONLY: C_INT +INTEGER(C_INT), VALUE :: DEVNUM +END FUNCTION DEVICE_SETDEVICE + +END INTERFACE DEVICESETDEVICE + +INTERFACE DEVICEGETDEVICE + +INTEGER FUNCTION DEVICE_GETDEVICE(DEVNUM) BIND(C, NAME='hipGetDevice') +USE ISO_C_BINDING, ONLY: C_INT +INTEGER(C_INT) :: DEVNUM +END FUNCTION DEVICE_GETDEVICE + +END INTERFACE DEVICEGETDEVICE + +INTERFACE DEVICEGETDEVICECOUNT + +INTEGER FUNCTION DEVICE_GETDEVICECOUNT(DEVNUM) BIND(C, NAME='hipGetDeviceCount') +USE ISO_C_BINDING, ONLY: C_INT +INTEGER(C_INT) :: DEVNUM +END FUNCTION DEVICE_GETDEVICECOUNT + +END INTERFACE DEVICEGETDEVICECOUNT + +INTERFACE DEVICEGETMEMINFO + +INTEGER FUNCTION DEVICE_MEMGETINFO(MEMFREE_MB, MEMTOTAL_MB) BIND(C, NAME='c_hipmemgetinfo') +USE ISO_C_BINDING, ONLY: C_INT +INTEGER(C_INT) :: MEMFREE_MB, MEMTOTAL_MB +END FUNCTION DEVICE_MEMGETINFO + +END INTERFACE DEVICEGETMEMINFO + +END MODULE DEVICE_MOD diff --git a/src/trans/gpu/algor/growing_allocator.h b/src/trans/gpu/algor/growing_allocator.h new file mode 100644 index 00000000..e236d973 --- /dev/null +++ b/src/trans/gpu/algor/growing_allocator.h @@ -0,0 +1,4 @@ +#pragma once + +extern "C" void growing_allocator_register_free_c(void *, + void (&)(float *, size_t)); diff --git a/src/trans/gpu/algor/hicblas.h b/src/trans/gpu/algor/hicblas.h new file mode 100644 index 00000000..42ef5fe4 --- /dev/null +++ b/src/trans/gpu/algor/hicblas.h @@ -0,0 +1,37 @@ + // (C) Copyright 2000- ECMWF. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + +// HIC--->BLAS +// hip +// cuda +// +// Common header to provide abstraction layer to utilize hipblas and cublas from +// common wrapper calls. Runtime and library specific implementations are pulled +// in from bespoke header files. +// + +#ifndef __HICBLAS_H__ +#define __HICBLAS_H__ + +#ifdef HIPGPU +#include "hicblas_hip.h" +#elif defined(CUDAGPU) +#include "hicblas_cuda.h" +#endif + +inline void _printError(const char * component, const char * file, const int line, int err, const char * err_str) { + fprintf(stderr, "%s error at 1\n", component); + fprintf(stderr, "%s error in file '%s'\n", component, file); + fprintf(stderr, "%s error at 2\n", component); + fprintf(stderr, "%s error line '%d'\n", component, line); + fprintf(stderr, "%s error at 3\n", component); + fprintf(stderr, "%s error %d: %s\nterminating!\n", component, err, err_str); + return; +} + +#endif diff --git a/src/trans/gpu/algor/hicblas_cuda.h b/src/trans/gpu/algor/hicblas_cuda.h new file mode 100644 index 00000000..f7997cd9 --- /dev/null +++ b/src/trans/gpu/algor/hicblas_cuda.h @@ -0,0 +1,125 @@ +// (C) Copyright 2000- ECMWF. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + +// Include cublas header and provide CPP macros to rewrite HIP and hipblas names +// to CUDA and cublas names + +#ifndef __HICBLAS_CUDA_H__ +#define __HICBLAS_CUDA_H__ + +#include "cublas_v2.h" +#include +#include + +// Library name +#define hipblas cublas +#define HIPBLAS CUBLAS + +// CPP definitions +#define HIPBLAS_OP_T CUBLAS_OP_T +#define HIPBLAS_OP_N CUBLAS_OP_N +#define HIPBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS + +// Data types +#define hipError_t cudaError_t +#define hipStream_t cudaStream_t +#define hipblasHandle_t cublasHandle_t +#define hipblasStatus_t cublasStatus_t +#define hipblasOperation_t cublasOperation_t +#define hipGraph_t cudaGraph_t +#define hipGraphNode_t cudaGraphNode_t +#define hipGraphExec_t cudaGraphExec_t + +// Constants +#define hipMemcpyHostToDevice cudaMemcpyHostToDevice +#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost + +// Library calls +#define hipblasCreate cublasCreate +#define hipblasDestroy cublasDestroy +#define hipblasDgemm cublasDgemm +#define hipblasSgemm cublasSgemm +#define hipblasDgemmBatched cublasDgemmBatched +#define hipblasSgemmBatched cublasSgemmBatched +#define hipblasDgemmStridedBatched cublasDgemmStridedBatched +#define hipblasSgemmStridedBatched cublasSgemmStridedBatched +#define hipblasSetStream cublasSetStream + +#define hipGraphExecDestroy cudaGraphExecDestroy +#define hipGraphCreate cudaGraphCreate +#define hipGraphDestroy cudaGraphDestroy +#define hipGraphLaunch cudaGraphLaunch +#define hipGraphInstantiate cudaGraphInstantiate +#define hipGraphAddChildGraphNode cudaGraphAddChildGraphNode +#define hipStreamCreate cudaStreamCreate +#define hipStreamDestroy cudaStreamDestroy +#define hipStreamCaptureModeGlobal cudaStreamCaptureModeGlobal +#define hipStreamBeginCapture cudaStreamBeginCapture +#define hipStreamEndCapture cudaStreamEndCapture + +// Runtime calls +#define hipHostMalloc(PTR, SIZE, FLAGS) cudaMallocHost(PTR, SIZE) +#define hipMalloc cudaMalloc +#define hipFree cudaFree +#define hipMemcpy cudaMemcpy +#define hipDeviceSynchronize cudaDeviceSynchronize +#define hipMemGetInfo cudaMemGetInfo + +inline static const char * _blasGetErrorEnum(cublasStatus_t error) +{ + switch (error) + { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + + return ""; +} + +#define HIC_CHECK(e) \ + { \ + cudaError_t err = (e); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error: %s, line %d, %s: %s\n", __FILE__, __LINE__, \ + #e, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define HICBLAS_CHECK(e) \ +{ \ + cublasStatus_t err = (e); \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "CUDA error: %s, line %d, %s: %s\n", __FILE__, __LINE__, \ + #e, _blasGetErrorEnum(err)); \ + exit(EXIT_FAILURE); \ + } \ +} + +#endif diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h new file mode 100644 index 00000000..a0bd6dd1 --- /dev/null +++ b/src/trans/gpu/algor/hicblas_cutlass.cuda.h @@ -0,0 +1,204 @@ +#ifdef USE_CUTLASS +//#include "hicblas.h" +#include "cutlass/gemm/device/gemm.h" + +#define CUTLASS_CHECK(e) \ + { \ + cutlass::Status err = (e); \ + if (err != cutlass::Status::kSuccess) { \ + fprintf(stderr, "CUTLASS error: %s, line %d, %s: %i\n", __FILE__, \ + __LINE__, #e, (int)err); \ + exit(EXIT_FAILURE); \ + } \ + } + +#ifdef USE_CUTLASS_3XTF32 +constexpr bool use_3xtf32 = true; +#else +constexpr bool use_3xtf32 = false; +#endif + + +template +CutlassGemm &get_cutlass_handle() { + static auto handle = std::make_unique(); + return *handle; +} + +namespace detail { + +enum class CutlassType { cutlass_3xtf32, cutlass_fp32 }; + +template +class cutlass_sgemm_grouped; + +template +class cutlass_sgemm_grouped { + // this was verified using Ampere and uses 3XTF32 + static constexpr int AlignmentA = 4; + static constexpr int AlignmentB = 4; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using OperatorClass = cutlass::arch::OpClassTensorOp; + using MyOp = cutlass::arch::OpMultiplyAddFastF32; + + using Gemm = cutlass::gemm::device::Gemm< + float, + std::conditional_t, // + float, + std::conditional_t, // + float, cutlass::layout::ColumnMajor, // + float, // + OperatorClass, cutlass::arch::Sm80, // + ThreadblockShape, WarpShape, InstructionShape, // + cutlass::epilogue::thread::LinearCombination< // + float, // + 128 / cutlass::sizeof_bits::value, + float, // + float // + >, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, // + 3, // + AlignmentA, // + AlignmentB, // + true, // + MyOp // + >; + static constexpr int sz_align = 8; + +public: + void operator()(cudaStream_t stream, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, + float *C, int ldc) const { + auto &gemm_op = get_cutlass_handle(); + CUTLASS_CHECK(gemm_op( + {// + {(m + sz_align - 1) / sz_align * sz_align, + (n + sz_align - 1) / sz_align * sz_align, + (k + sz_align - 1) / sz_align * sz_align}, + {const_cast(A), lda}, + {const_cast(B), ldb}, + {C, ldc}, + {C, ldc}, + {alpha, beta}}, + nullptr, stream)); + } +}; +template +class cutlass_sgemm_grouped { + // this was verified using Volta and uses FP32 + static constexpr int AlignmentA = 1; + static constexpr int AlignmentB = 1; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using OperatorClass = cutlass::arch::OpClassSimt; + using MyOp = cutlass::arch::OpMultiplyAdd; + + using Gemm = cutlass::gemm::device::Gemm< + float, // + std::conditional_t, // + float, // + std::conditional_t, // + float, cutlass::layout::ColumnMajor, // + float, // + OperatorClass, cutlass::arch::Sm70, // + ThreadblockShape, WarpShape, InstructionShape, // + cutlass::epilogue::thread::LinearCombination< // + float, // + 1, // + float, // + float // + >, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, // + 2, // + AlignmentA, // + AlignmentB, // + true, // + MyOp // + >; + static constexpr int sz_align = 1; + +public: + void operator()(cudaStream_t stream, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, + float *C, int ldc) const { + auto &gemm_op = get_cutlass_handle(); + CUTLASS_CHECK(gemm_op( + {// + {(m + sz_align - 1) / sz_align * sz_align, + (n + sz_align - 1) / sz_align * sz_align, + (k + sz_align - 1) / sz_align * sz_align}, + {const_cast(A), lda}, + {const_cast(B), ldb}, + {C, ldc}, + {C, ldc}, + {alpha, beta}}, + nullptr, stream)); + } +}; + +} // namespace detail +template +void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, + float alpha, const float *A, int lda, + int *offsetsA, const float *B, int ldb, + int *offsetsB, float beta, float *C, + int ldc, int *offsetsC, int batchCount, + cudaStream_t stream, + void *growing_allocator) { + using namespace detail; + int device; + HIC_CHECK(cudaGetDevice(&device)); + int capability_major; + HIC_CHECK(cudaDeviceGetAttribute(&capability_major, + cudaDevAttrComputeCapabilityMajor, device)); + if (capability_major >= 8 && use_3xtf32) + run_group_graph(cutlass_sgemm_grouped(), + m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount, stream, blas_id, + growing_allocator); + else + run_group_graph(cutlass_sgemm_grouped(), + m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount, stream, blas_id, + growing_allocator); +} + +void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb, + int m, int *n, int *k, float alpha, + const float *A, int lda, int *offsetsA, + const float *B, int ldb, int *offsetsB, float beta, + float *C, int ldc, int *offsetsC, + int batchCount, cudaStream_t stream, + void *growing_allocator) { + + if (transa == 'N' && transb == 'N') + cutlass_sgemm_wrapper_grouped_op( + blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount, stream, growing_allocator); + else if (transa == 'N' && transb == 'T') + cutlass_sgemm_wrapper_grouped_op( + blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount, stream, growing_allocator); + else if (transa == 'T' && transb == 'N') + cutlass_sgemm_wrapper_grouped_op( + blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount, stream, growing_allocator); + else if (transa == 'T' && transb == 'T') + cutlass_sgemm_wrapper_grouped_op( + blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount, stream, growing_allocator); + else + assert(false); +} +//} + +#endif diff --git a/src/trans/gpu/algor/hicblas_gemm.cuda.cu b/src/trans/gpu/algor/hicblas_gemm.cuda.cu new file mode 120000 index 00000000..365279e6 --- /dev/null +++ b/src/trans/gpu/algor/hicblas_gemm.cuda.cu @@ -0,0 +1 @@ +hicblas_gemm.hip.cpp \ No newline at end of file diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp new file mode 100644 index 00000000..34a68c4e --- /dev/null +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -0,0 +1,353 @@ +// (C) Copyright 2000- ECMWF. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + +#include +#include + +#include +#include +#include +#include + +#include "hicblas.h" +#ifdef USE_CUTLASS +#include "cutlass/gemm/device/gemm.h" +#endif + +#include "growing_allocator.h" + + +bool hip_alreadyAllocated_sgemm=false; +bool hip_alreadyAllocated_sgemm_handle=false; + +bool hip_alreadyAllocated_dsgemm=false; +bool hip_alreadyAllocated_dgemm_handle=false; + +hipblasHandle_t handle_hip_sgemm; +hipblasHandle_t handle_hip_dgemm; + + +namespace { +namespace detail { +struct pair_hash { + std::size_t operator()(const std::pair &p) const { + return p.first * 10000 + p.second; + } +}; +} // namespace detail + +template auto &get_graph_cache() { + // we store at most one graph per "m" (# fields) and "blas id" + static std::unordered_map, hipGraphExec_t, + detail::pair_hash> + graphCache; + return graphCache; +} +template auto &get_ptr_cache() { + static std::unordered_map< + std::pair, std::tuple, + detail::pair_hash> + ptrCache; + return ptrCache; +} + +template void free_gemm_cache(float *, size_t) { + get_graph_cache().clear(); + get_ptr_cache().clear(); +} + +// this version is using graphs and caches the graphs +template +void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, + const Real *A, int lda, int *offsetsA, const Real *B, + int ldb, int *offsetsB, Real beta, Real *C, int ldc, + int *offsetsC, int batchCount, hipStream_t stream, + int blas_id, void *growing_allocator) { + growing_allocator_register_free_c(growing_allocator, + free_gemm_cache); + + // we store at most one graph per "m" (# fields) and "blas id" + auto &graphCache = get_graph_cache(); + + // we also store A, B, and C and recreate the graph if they change + auto &ptrCache = get_ptr_cache(); + + auto key = std::make_pair(m, blas_id); + + auto ptrs = ptrCache.find(key); + if (ptrs != ptrCache.end() && + (std::get<0>(ptrs->second) != A || std::get<1>(ptrs->second) != B || + std::get<2>(ptrs->second) != C)) { + // the plan is cached, but the pointers are not correct. we remove and + // delete the graph, but we keep the hipblas handles, if this happens more + // often, we should cache this... + std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." << std::endl; + std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id + << "}\n"; + std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", " + << std::get<1>(ptrs->second) << ", " << std::get<2>(ptrs->second) + << " vs. " << A << ", " << B << ", " << C << std::endl; + HIC_CHECK(hipGraphExecDestroy(graphCache[key])); + graphCache.erase(key); + ptrCache.erase(key); + } + + auto graph = graphCache.find(key); + if (graph == graphCache.end()) { + // this graph does not exist yet + hipStream_t stream; + HIC_CHECK(hipStreamCreate(&stream)); + + hipGraph_t new_graph; + hipGraphCreate(&new_graph, 0); + for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) + continue; + + HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], + ldb, beta, C + offsetsC[i], ldc); + hipGraph_t my_graph; + HIC_CHECK(hipStreamEndCapture(stream, &my_graph)); + hipGraphNode_t my_node; + HIC_CHECK(hipGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, + my_graph)); + } + hipGraphExec_t instance; + HIC_CHECK(hipGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); + HIC_CHECK(hipStreamDestroy(stream)); + HIC_CHECK(hipGraphDestroy(new_graph)); + + graphCache.insert({key, instance}); + ptrCache.insert({key, std::make_tuple(A, B, C)}); + } + + HIC_CHECK(hipGraphLaunch(graphCache.at(key), stream)); +} + +// stupid simple gemm calls +template +void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, + int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, + Real beta, Real *C, int ldc, int *offsetsC, int batchCount, + hipStream_t stream, int = -1) { + for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) + continue; + gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, + beta, C + offsetsC[i], ldc); + } +} + +#ifdef USE_CUTLASS +#include "hicblas_cutlass.cuda.h" +#endif + +namespace detail { +hipblasHandle_t get_hipblas_handle() { + static hipblasHandle_t handle; + if (!handle) + HICBLAS_CHECK(hipblasCreate(&handle)); + return handle; +} +template struct hipblas_gemm_grouped { +public: + hipblas_gemm_grouped(hipblasOperation_t transa, hipblasOperation_t transb) + : transa_(transa), transb_(transb) { + // we need to get the hipblas handle here, otherwise this could be created + // during graph capturing + get_hipblas_handle(); + }; + void operator()(hipStream_t stream, int m, int n, int k, Real alpha, + const Real *A, int lda, const Real *B, int ldb, Real beta, + Real *C, int ldc) const { + hipblasHandle_t handle = get_hipblas_handle(); + HICBLAS_CHECK(hipblasSetStream(handle, stream)); + + if constexpr (std::is_same::value) + HICBLAS_CHECK(hipblasSgemm(handle, transa_, transb_, m, n, k, &alpha, A, + lda, B, ldb, &beta, C, ldc)); + if constexpr (std::is_same::value) + HICBLAS_CHECK(hipblasDgemm(handle, transa_, transb_, m, n, k, &alpha, A, + lda, B, ldb, &beta, C, ldc)); + } + +private: + hipblasOperation_t transa_, transb_; +}; +} // namespace detail + +#ifndef USE_CUTLASS + +void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, + int m, int *n, int *k, float alpha, + const float *A, int lda, int *offsetsA, + const float *B, int ldb, int *offsetsB, float beta, + float *C, int ldc, int *offsetsC, + int batchCount, hipStream_t stream, + void *growing_allocator) { + + hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; + if (transa=='T' || transa=='t') + op_t1=HIPBLAS_OP_T; + if (transb=='T' || transb=='t') + op_t2=HIPBLAS_OP_T; + + using namespace detail; +#ifdef USE_GRAPHS_GEMM + run_group_graph(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, A, + lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, stream, blas_id, growing_allocator); +#else + run_group(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, A, + lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, stream); +#endif +} + +#endif + +void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, + int m, int *n, int *k, + double alpha, + const double *A, int lda, int *offsetsA, + const double *B, int ldb, int *offsetsB, + double beta, + double *C, int ldc, int *offsetsC, + int batchCount, hipStream_t stream, void *) { + + hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; + if (transa=='T' || transa=='t') + op_t1=HIPBLAS_OP_T; + if (transb=='T' || transb=='t') + op_t2=HIPBLAS_OP_T; + + using namespace detail; + run_group(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, + A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, stream, blas_id); +} + +} // namespace + +extern "C" { +void hipblas_dgemm_wrapper (char transa, char transb, + int m, int n,int k, double alpha, + const double *A, int lda, int tda, + const double *B, int ldb, int tdb, double beta, + double *C, int ldc, int tdc, int batchCount, + size_t stream, + void *growing_allocator) { + + hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; + + if (transa=='T' || transa=='t') + op_t1=HIPBLAS_OP_T; + if (transb=='T' || transb=='t') + op_t2=HIPBLAS_OP_T; + + if (!hip_alreadyAllocated_dgemm_handle){ + hipblasCreate(&handle_hip_dgemm); + hip_alreadyAllocated_dgemm_handle=true; + } + hipblasHandle_t handle = detail::get_hipblas_handle(); + HICBLAS_CHECK( + hipblasSetStream(handle, *(hipStream_t*)stream)); + + HICBLAS_CHECK(hipblasDgemmStridedBatched(handle,op_t1,op_t2,m,n,k, + &alpha,(const double *) A,lda,tda, (const double *) B,ldb,tdb, + &beta,(double *) C,ldc,tdc,batchCount)); + +} + +void hipblas_sgemm_wrapper (char transa, char transb, + int m, int n,int k, float alpha, + const float *A, int lda, int tda, + const float *B, int ldb, int tdb, float beta, + float *C, int ldc, int tdc, + int batchCount, + void *growing_allocator) { + + hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; + + if (transa=='T' || transa=='t') + op_t1=HIPBLAS_OP_T; + if (transb=='T' || transb=='t') + op_t2=HIPBLAS_OP_T; + + if (!hip_alreadyAllocated_sgemm_handle){ + hipblasCreate(&handle_hip_sgemm); + hip_alreadyAllocated_sgemm_handle=true; + } + HICBLAS_CHECK(hipblasSgemmStridedBatched(handle_hip_sgemm,op_t1,op_t2,m,n,k, + &alpha,(const float *) A,lda,tda, (const float *) B,ldb,tdb, + &beta,(float*) C,ldc,tdc,batchCount)); + +} + +void blas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, + int m, int *n, int *k, float alpha, + const float *A, int lda, int *offsetsA, + const float *B, int ldb, int *offsetsB, float beta, + float *C, int ldc, int *offsetsC, + int batchCount, size_t stream, + void *growing_allocator) { +#ifdef USE_CUTLASS + cutlass_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, + B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, + *(hipStream_t*)stream, + growing_allocator); +#else + hipblas_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, + offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount, + *(hipStream_t*)stream, + growing_allocator); +#endif +} + +void blas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, + int m, int *n, int *k, double alpha, + const double *A, int lda, int *offsetsA, + const double *B, int ldb, int *offsetsB, double beta, + double *C, int ldc, int *offsetsC, + int batchCount, size_t stream, + void *growing_allocator) { + hipblas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, + ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, + *(hipStream_t*)stream, + growing_allocator); +} +} + +extern "C" void hipblasSgemmBatched_finalize () +{ + +#ifdef FALSE + if (hip_alreadyAllocated_sgemm){ + + hipFree(Aarray_sgemm_hip); + hipFree(Barray_sgemm_hip); + hipFree(Carray_sgemm_hip); + + hipFree(d_Aarray_sgemm_hip); + hipFree(d_Barray_sgemm_hip); + hipFree(d_Carray_sgemm_hip); + + } +#endif + + if (hip_alreadyAllocated_sgemm_handle){ + hipblasDestroy(handle_hip_sgemm); + } + if (hip_alreadyAllocated_dgemm_handle){ + hipblasDestroy(handle_hip_dgemm); + } + +} + diff --git a/src/trans/gpu/algor/hicblas_hip.h b/src/trans/gpu/algor/hicblas_hip.h new file mode 100644 index 00000000..1fa0d0cb --- /dev/null +++ b/src/trans/gpu/algor/hicblas_hip.h @@ -0,0 +1,78 @@ +// (C) Copyright 2000- ECMWF. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + +// Include hip runtime and hipblas headers + +#ifndef __HICBLAS_HIP_H__ +#define __HICBLAS_HIP_H__ + +#include +#include + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-W#pragma-messages" +#endif +#include "hipblas.h" +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +inline static const char * _blasGetErrorEnum(hipblasStatus_t error) +{ + switch (error) + { + case HIPBLAS_STATUS_SUCCESS: + return "HIPBLAS_STATUS_SUCCESS"; + + case HIPBLAS_STATUS_NOT_INITIALIZED: + return "HIPBLAS_STATUS_NOT_INITIALIZED"; + + case HIPBLAS_STATUS_ALLOC_FAILED: + return "HIPBLAS_STATUS_ALLOC_FAILED"; + + case HIPBLAS_STATUS_INVALID_VALUE: + return "HIPBLAS_STATUS_INVALID_VALUE"; + + case HIPBLAS_STATUS_ARCH_MISMATCH: + return "HIPBLAS_STATUS_ARCH_MISMATCH"; + + case HIPBLAS_STATUS_MAPPING_ERROR: + return "HIPBLAS_STATUS_MAPPING_ERROR"; + + case HIPBLAS_STATUS_EXECUTION_FAILED: + return "HIPBLAS_STATUS_EXECUTION_FAILED"; + + case HIPBLAS_STATUS_INTERNAL_ERROR: + return "HIPBLAS_STATUS_INTERNAL_ERROR"; + } + + return ""; +} + +#define HICBLAS_CHECK(e) \ + { \ + hipblasStatus_t err = (e); \ + if (err != HIPBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "HIP error: %s, line %d, %s: %s\n", __FILE__, __LINE__, \ + #e, _blasGetErrorEnum(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define HIC_CHECK(e) \ + { \ + hipError_t err = (e); \ + if (err != hipSuccess) { \ + fprintf(stderr, "HIP error: %s, line %d, %s: %s\n", __FILE__, __LINE__, \ + #e, hipGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#endif diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 new file mode 100644 index 00000000..d6e2a5a3 --- /dev/null +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -0,0 +1,404 @@ +! (C) Copyright 2000- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +#if defined CUDAGPU +#define hipblasSgemm 'cublasSgemm' +#define hipblasDgemm 'cublasDgemm' +#define ACC_GET_HIP_STREAM ACC_GET_CUDA_STREAM +#define OPENACC_LIB OPENACC +#endif + +MODULE HICBLAS_MOD + +USE PARKIND1, ONLY : JPIM, JPRM, JPRD +USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE +USE ISO_C_BINDING +USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM + +IMPLICIT NONE + + INTERFACE HIP_GEMM_BATCHED + MODULE PROCEDURE HIP_DGEMM_BATCHED_OVERLOAD + MODULE PROCEDURE HIP_SGEMM_BATCHED_OVERLOAD + MODULE PROCEDURE HIP_DGEMM_GROUPED_OVERLOAD + MODULE PROCEDURE HIP_SGEMM_GROUPED_OVERLOAD + END INTERFACE HIP_GEMM_BATCHED + +! +! Define the interfaces to HIP/CUDA C code via a common wrapper interface +! +interface hip_gemm +! +! void hipblasSgemm (char transa, char transb, int m, int n, +! int k, float alpha, const float *A, int lda, +! const float *B, int ldb, float beta, float *C, int ldc) +! +SUBROUTINE HIP_SGEMM(CTA, CTB, M, N, K,& +ALPHA, A, LDA, B, LDB, BETA, C, LDC) BIND(C,NAME='hipblasSgemm') +USE ISO_C_BINDING +CHARACTER(1,C_CHAR),VALUE :: CTA, CTB +INTEGER(C_INT), VALUE :: M,N,K,LDA,LDB,LDC +REAL(C_FLOAT), VALUE :: ALPHA,BETA +REAL(C_FLOAT), DIMENSION(LDA,*) :: A +REAL(C_FLOAT), DIMENSION(LDB,*) :: B +REAL(C_FLOAT), DIMENSION(LDC,*) :: C +END SUBROUTINE HIP_SGEMM + +! +! void hipblasDgemm (char transa, char transb, int m, int n, +! int k, double alpha, const double *A, int lda, +! const double *B, int ldb, double beta, double *C, int ldc) +! +SUBROUTINE HIP_DGEMM(CTA, CTB, M, N, K,& +ALPHA, A, LDA, B, LDB, BETA, C, LDC) BIND(C,NAME='hipblasDgemm') +USE ISO_C_BINDING +CHARACTER(1,C_CHAR),VALUE :: CTA, CTB +INTEGER(C_INT), VALUE :: M,N,K,LDA,LDB,LDC +REAL(C_DOUBLE), VALUE :: ALPHA,BETA +REAL(C_DOUBLE), DIMENSION(LDA,*) :: A +REAL(C_DOUBLE), DIMENSION(LDB,*) :: B +REAL(C_DOUBLE), DIMENSION(LDC,*) :: C +END SUBROUTINE HIP_DGEMM +END INTERFACE + +INTERFACE + SUBROUTINE HIP_DGEMM_BATCHED( & + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT, STREAM, ALLOC & + &) BIND(C, NAME='hipblas_dgemm_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_DOUBLE), VALUE :: ALPHA,BETA + REAL(C_DOUBLE), DIMENSION(LDA,*) :: A + REAL(C_DOUBLE), DIMENSION(LDB,*) :: B + REAL(C_DOUBLE), DIMENSION(LDC,*) :: C + INTEGER(KIND=C_SIZE_T) :: STREAM + TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC + END SUBROUTINE HIP_DGEMM_BATCHED +END INTERFACE + +INTERFACE + SUBROUTINE HIP_DGEMM_STRIDED_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT, STREAM & + &) BIND(C, NAME='hipblasDgemmStridedBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_INT), VALUE :: TDA,TDB,TDC + REAL(C_DOUBLE), VALUE :: ALPHA, BETA + REAL(C_DOUBLE), DIMENSION(LDA,*) :: A + REAL(C_DOUBLE), DIMENSION(LDB,*) :: B + REAL(C_DOUBLE), DIMENSION(LDC,*) :: C + INTEGER(KIND=C_SIZE_T) :: STREAM + END SUBROUTINE HIP_DGEMM_STRIDED_BATCHED +END INTERFACE + +INTERFACE + SUBROUTINE HIP_DGEMM_BATCHED_FINALIZE() BIND(C,NAME='hipblasDgemmBatched_finalize') + END SUBROUTINE HIP_DGEMM_BATCHED_FINALIZE +END INTERFACE + +INTERFACE + SUBROUTINE HIP_SGEMM_BATCHED( & + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT, STREAM, ALLOC & + &) BIND(C, NAME='hipblas_sgemm_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_FLOAT), VALUE :: ALPHA, BETA + REAL(C_FLOAT), DIMENSION(LDA,*) :: A + REAL(C_FLOAT), DIMENSION(LDB,*) :: B + REAL(C_FLOAT), DIMENSION(LDC,*) :: C + INTEGER(KIND=C_SIZE_T) :: STREAM + TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC + END SUBROUTINE HIP_SGEMM_BATCHED +END INTERFACE + +INTERFACE + SUBROUTINE HIP_SGEMM_STRIDED_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT, STREAM & + &) BIND(C, NAME='hipblasSgemmStridedBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_INT), VALUE :: TDA,TDB,TDC + REAL(C_FLOAT), VALUE :: ALPHA, BETA + REAL(C_FLOAT), DIMENSION(LDA,*) :: A + REAL(C_FLOAT), DIMENSION(LDB,*) :: B + REAL(C_FLOAT), DIMENSION(LDC,*) :: C + INTEGER(KIND=C_SIZE_T) :: STREAM + END SUBROUTINE HIP_SGEMM_STRIDED_BATCHED +END INTERFACE + +INTERFACE + SUBROUTINE HIP_SGEMM_BATCHED_FINALIZE() BIND(C,NAME='hipblasSgemmBatched_finalize') + END SUBROUTINE HIP_SGEMM_BATCHED_FINALIZE +END INTERFACE + +INTERFACE +SUBROUTINE HIP_DGEMM_GROUPED( & + & BLAS_ID, CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, OFFSETA, & + & B, LDB, OFFSETB, & + & BETA, & + & C, LDC, OFFSETC, & + & BATCHCOUNT, STREAM, ALLOC & +&) BIND(C, NAME='blas_dgemm_wrapper_grouped') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) + REAL(C_DOUBLE), VALUE :: ALPHA,BETA + REAL(C_DOUBLE) :: A(*), B(*), C(*) + INTEGER(KIND=C_SIZE_T) :: STREAM + TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC +END SUBROUTINE HIP_DGEMM_GROUPED +SUBROUTINE HIP_SGEMM_GROUPED( & + & BLAS_ID, CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, OFFSETA, & + & B, LDB, OFFSETB, & + & BETA, & + & C, LDC, OFFSETC, & + & BATCHCOUNT, STREAM, ALLOC & +&) BIND(C, NAME='blas_sgemm_wrapper_grouped') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) + REAL(C_FLOAT), VALUE :: ALPHA,BETA + REAL(C_FLOAT) :: A(*), B(*), C(*) + INTEGER(KIND=C_SIZE_T) :: STREAM + TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC +END SUBROUTINE HIP_SGEMM_GROUPED +END INTERFACE + +CONTAINS + +SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT, STREAM, ALLOC) + CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N + INTEGER(KIND=JPIM) :: K + REAL(KIND=JPRD) :: ALPHA + REAL(KIND=JPRD), DIMENSION(:) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIM) :: STRIDEA + REAL(KIND=JPRD), DIMENSION(:,:) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: STRIDEB + REAL(KIND=JPRD) :: BETA + REAL(KIND=JPRD), DIMENSION(:) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIM) :: STRIDEC + INTEGER(KIND=JPIM) :: BATCHCOUNT + INTEGER(KIND=C_INT) :: STREAM + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + + INTEGER(KIND=C_LONG) :: HIP_STREAM + + HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) + +#if defined(_CRAYFTN) + !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) +#endif + CALL HIP_DGEMM_BATCHED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) +#if defined(_CRAYFTN) + !$ACC END HOST_DATA +#endif + END SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD + + SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT, STREAM, ALLOC) + CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N + INTEGER(KIND=JPIM) :: K + REAL(KIND=JPRM) :: ALPHA + REAL(KIND=JPRM), DIMENSION(:) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIM) :: STRIDEA + REAL(KIND=JPRM), DIMENSION(*) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: STRIDEB + REAL(KIND=JPRM) :: BETA + REAL(KIND=JPRM), DIMENSION(:) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIM) :: STRIDEC + INTEGER(KIND=JPIM) :: BATCHCOUNT + INTEGER(KIND=C_INT) :: STREAM + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + + INTEGER(KIND=C_LONG) :: HIP_STREAM + + HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) + + CALL HIP_SGEMM_BATCHED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) + END SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD + + + SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & + & BLAS_ID, TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, OFFSETA, & + & BARRAY, LDB, OFFSETB, & + & BETA, & + & CARRAY, LDC, OFFSETC, & + & BATCHCOUNT, STREAM, ALLOC) + INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID + CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N(:) + INTEGER(KIND=JPIM) :: K(:) + REAL(KIND=JPRD) :: ALPHA + REAL(KIND=JPRD), DIMENSION(:) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIM) :: OFFSETA(:) + REAL(KIND=JPRD), DIMENSION(*) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: OFFSETB(:) + REAL(KIND=JPRD) :: BETA + REAL(KIND=JPRD), DIMENSION(:) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIM) :: OFFSETC(:) + INTEGER(KIND=JPIM) :: BATCHCOUNT + INTEGER(KIND=C_INT) :: STREAM + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + + INTEGER(KIND=C_LONG) :: HIP_STREAM + + HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) + + CALL HIP_DGEMM_GROUPED( & + & BLAS_ID, TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, OFFSETA, & + & BARRAY, LDB, OFFSETB, & + & BETA, & + & CARRAY, LDC, OFFSETC, & + & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) + + END SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD + + SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& + & BLAS_ID, TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, OFFSETA, & + & BARRAY, LDB, OFFSETB, & + & BETA, & + & CARRAY, LDC, OFFSETC, & + & BATCHCOUNT, STREAM, ALLOC) + INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID + CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N(:) + INTEGER(KIND=JPIM) :: K(:) + REAL(KIND=JPRM) :: ALPHA + REAL(KIND=JPRM), DIMENSION(:) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIM) :: OFFSETA(:) + REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: OFFSETB(:) + REAL(KIND=JPRM) :: BETA + REAL(KIND=JPRM), DIMENSION(:) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIM) :: OFFSETC(:) + INTEGER(KIND=JPIM) :: BATCHCOUNT + INTEGER(KIND=C_INT) :: STREAM + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + + INTEGER(KIND=C_LONG) :: HIP_STREAM + + HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) + +#if defined(_CRAYFTN) + !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) +#endif + CALL HIP_SGEMM_GROUPED( & + & BLAS_ID, TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, OFFSETA, & + & BARRAY, LDB, OFFSETB, & + & BETA, & + & CARRAY, LDC, OFFSETC, & + & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) +#if defined(_CRAYFTN) + !$ACC END HOST_DATA +#endif + + END SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD + +END MODULE HICBLAS_MOD diff --git a/src/trans/gpu/algor/hicfft.h b/src/trans/gpu/algor/hicfft.h new file mode 100644 index 00000000..2db3cc7c --- /dev/null +++ b/src/trans/gpu/algor/hicfft.h @@ -0,0 +1,52 @@ +// (C) Copyright 2000- ECMWF. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + +// HIC--->FFT +// hip +// cuda +// +// Common header to provide abstraction layer to utilize hipfft and cufft from +// common wrapper calls. Runtime and library specific implementations are pulled +// in from bespoke header files. +// + +#ifndef __HICFFT_H__ +#define __HICFFT_H__ + +#include +#include +#include +#include +#include "abor1.h" + +#ifdef HIPGPU +#include "hicfft_hip.h" +#elif defined(CUDAGPU) +#include "hicfft_cuda.h" +#endif + +inline void _printError(const char * component, const char * file, const int line, int err, const char * err_str) { + fprintf(stderr, "%s error at 1\n", component); + fprintf(stderr, "%s error in file '%s'\n", component, file); + fprintf(stderr, "%s error at 2\n", component); + fprintf(stderr, "%s error line '%d'\n", component, line); + fprintf(stderr, "%s error at 3\n", component); + fprintf(stderr, "%s error %d: %s\nterminating!\n", component, err, err_str); + return; +} + +inline void __fftSafeCall(hipfftResult err, const char *file, const int line) +{ + if( hipSuccess != (int) err) { + _printError("GPU runtime", file, line, err, _fftGetErrorEnum(err)); + hipDeviceReset(); + ABOR1("Error in FFT\n"); + } +} + +#endif diff --git a/src/trans/gpu/algor/hicfft_create_plan.cuda.cu b/src/trans/gpu/algor/hicfft_create_plan.cuda.cu new file mode 120000 index 00000000..fdc9ab89 --- /dev/null +++ b/src/trans/gpu/algor/hicfft_create_plan.cuda.cu @@ -0,0 +1 @@ +hicfft_create_plan.hip.cpp \ No newline at end of file diff --git a/src/trans/gpu/algor/hicfft_create_plan.hip.cpp b/src/trans/gpu/algor/hicfft_create_plan.hip.cpp new file mode 100644 index 00000000..a1aa074c --- /dev/null +++ b/src/trans/gpu/algor/hicfft_create_plan.hip.cpp @@ -0,0 +1,79 @@ +#include "hicfft.h" + +#define fftSafeCall(err) __fftSafeCall(err, __FILE__, __LINE__) + +// static int allocatedWorkspace=0; +// static void* planWorkspace; +// static int planWorkspaceSize=100*1024*1024; //100MB +void *planWorkspace = nullptr; +static int currentWorkspaceSize = 0; + +extern "C" +void +hicfft_create_plan_(hipfftHandle * *plan, int *ISIGNp, int *Np, int *LOTp, int *stridep, int *plan_size) +{ + int ISIGN = *ISIGNp; + int N = *Np; + int LOT = *LOTp; + int stride = *stridep; + + *plan = new hipfftHandle; + + if (hipDeviceSynchronize() != hipSuccess){ + fprintf(stderr, "GPU runtime error: Failed to synchronize\n"); + return; + } + + int embed[1]; + int dist; + + #ifdef TRANS_SINGLE + hipfftType fft_dir = HIPFFT_R2C; + hipfftType fft_inv = HIPFFT_C2R; + #else + hipfftType fft_dir = HIPFFT_D2Z; + hipfftType fft_inv = HIPFFT_Z2D; + #endif + + embed[0] = 1; + dist = 1; + + fftSafeCall(hipfftCreate(*plan)); + + // Disable auto allocation + fftSafeCall(hipfftSetAutoAllocation(**plan, false)); + + if( ISIGN== -1 ){ + fftSafeCall(hipfftPlanMany(*plan, 1, &N, + embed, stride, dist, + embed, stride, dist, + fft_dir, LOT)); + } else if( ISIGN== 1){ + fftSafeCall(hipfftPlanMany(*plan, 1, &N, + embed, stride, dist, + embed, stride, dist, + fft_inv, LOT)); + } else { + abort(); + } + + // get size used by this plan + size_t thisWorkplanSize; + hipfftGetSize(**plan, &thisWorkplanSize); + + // check if this the work space is sufficiently large + if (thisWorkplanSize > currentWorkspaceSize) { + hipDeviceSynchronize(); + hipFree(planWorkspace); + hipMalloc(&planWorkspace, thisWorkplanSize); + currentWorkspaceSize = thisWorkplanSize; + } + + if (hipDeviceSynchronize() != hipSuccess){ + fprintf(stderr, "GPU runtime error: Failed to synchronize\n"); + return; + } + + return; + +} diff --git a/src/trans/gpu/algor/hicfft_cuda.h b/src/trans/gpu/algor/hicfft_cuda.h new file mode 100644 index 00000000..89745331 --- /dev/null +++ b/src/trans/gpu/algor/hicfft_cuda.h @@ -0,0 +1,140 @@ +// (C) Copyright 2000- ECMWF. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + +// Include cufft header and provide CPP macros to rewrite HIP and hipfft names +// to CUDA and cufft names + +#ifndef __HICFFT_CUDA_H__ +#define __HICFFT_CUDA_H__ + +#include "cufft.h" + +// Library Names +#define hipfft cufft +#define HIPFFT CUFFT + +// CPP macros +#define HIPFFT_SUCCESS CUFFT_SUCCESS +#define HIPFFT_R2C CUFFT_R2C +#define HIPFFT_C2R CUFFT_C2R +#define HIPFFT_D2Z CUFFT_D2Z +#define HIPFFT_Z2D CUFFT_Z2D + +// Constants and types +#define hipError_t cudaError_t +#define hipSuccess cudaSuccess +#define hipfftHandle cufftHandle +#define hipfftType cufftType +#define hipfftResult cufftResult +#define hipfftComplex cufftComplex +#define hipfftReal cufftReal +#define hipfftDoubleComplex cufftDoubleComplex +#define hipfftDoubleReal cufftDoubleReal +#define hipGraph_t cudaGraph_t +#define hipGraphNode_t cudaGraphNode_t +#define hipGraphExec_t cudaGraphExec_t +#define hipStream_t cudaStream_t + +#define hipfftCreate cufftCreate +#define hipfftDestroy cufftDestroy +#define hipfftPlanMany cufftPlanMany +#define hipfftGetSize cufftGetSize +#define hipfftSetAutoAllocation cufftSetAutoAllocation +#define hipfftSetStream cufftSetStream +#define hipStreamCreate cudaStreamCreate +#define hipStreamDestroy cudaStreamDestroy +#define hipSetStream cudaSetStream +#define hipGraphCreate cudaGraphCreate +#define hipGraphDestroy cudaGraphDestroy +#define hipGraphLaunch cudaGraphLaunch +#define hipGraphExecDestroy cudaGraphExecDestroy +#define hipStreamCaptureModeGlobal cudaStreamCaptureModeGlobal +#define hipStreamBeginCapture cudaStreamBeginCapture +#define hipStreamEndCapture cudaStreamEndCapture +#define hipGraphAddChildGraphNode cudaGraphAddChildGraphNode +#define hipGraphInstantiate cudaGraphInstantiate + +#define hipfftExecR2C cufftExecR2C +#define hipfftExecC2R cufftExecC2R +#define hipfftExecD2Z cufftExecD2Z +#define hipfftExecZ2D cufftExecZ2D + +// Runtime calls +#define hipDeviceSynchronize cudaDeviceSynchronize +#define hipDeviceReset cudaDeviceReset +#define _hipGetErrorEnum _cudaGetErrorEnum +#define hipFree cudaFree +#define hipMalloc cudaMalloc + +inline static const char * _fftGetErrorEnum(cufftResult error) +{ + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} + + #define HIC_CHECK(e) \ + { \ + cudaError_t err = (e); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error: %s, line %d, %s: %s\n", __FILE__, __LINE__, \ + #e, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#endif diff --git a/src/trans/gpu/algor/hicfft_destroy_plan.cuda.cu b/src/trans/gpu/algor/hicfft_destroy_plan.cuda.cu new file mode 120000 index 00000000..398313b5 --- /dev/null +++ b/src/trans/gpu/algor/hicfft_destroy_plan.cuda.cu @@ -0,0 +1 @@ +hicfft_destroy_plan.hip.cpp \ No newline at end of file diff --git a/src/trans/gpu/algor/hicfft_destroy_plan.hip.cpp b/src/trans/gpu/algor/hicfft_destroy_plan.hip.cpp new file mode 100644 index 00000000..1e5e1fb1 --- /dev/null +++ b/src/trans/gpu/algor/hicfft_destroy_plan.hip.cpp @@ -0,0 +1,23 @@ +#include "hicfft.h" + +#define fftSafeCall(err) __fftSafeCall(err, __FILE__, __LINE__) + +extern "C" +void +hicfft_destroy_plan_(hipfftHandle *PLANp) +{ + hipfftHandle plan = *PLANp; + + if (hipDeviceSynchronize() != hipSuccess){ + fprintf(stderr, "GPU runtime error: Failed to synchronize\n"); + return; + } + + fftSafeCall(hipfftDestroy(plan)); + + if (hipDeviceSynchronize() != hipSuccess){ + fprintf(stderr, "GPU runtime error: Failed to synchronize\n"); + return; + } + +} diff --git a/src/trans/gpu/algor/hicfft_execute_plan.cuda.cu b/src/trans/gpu/algor/hicfft_execute_plan.cuda.cu new file mode 120000 index 00000000..3dd81ece --- /dev/null +++ b/src/trans/gpu/algor/hicfft_execute_plan.cuda.cu @@ -0,0 +1 @@ +hicfft_execute_plan.hip.cpp \ No newline at end of file diff --git a/src/trans/gpu/algor/hicfft_execute_plan.hip.cpp b/src/trans/gpu/algor/hicfft_execute_plan.hip.cpp new file mode 100644 index 00000000..4a92e1fd --- /dev/null +++ b/src/trans/gpu/algor/hicfft_execute_plan.hip.cpp @@ -0,0 +1,240 @@ +#include "hicfft.h" + +#include "growing_allocator.h" + +#define fftSafeCall(err) __fftSafeCall(err, __FILE__, __LINE__) + +#ifdef TRANS_SINGLE +typedef float DATA_TYPE; +typedef hipfftComplex HIP_DATA_TYPE_COMPLEX; +typedef hipfftReal HIP_DATA_TYPE_REAL; +#define fftExecDir hipfftExecR2C +#define fftExecInv hipfftExecC2R +#else +typedef double DATA_TYPE; +typedef hipfftDoubleComplex HIP_DATA_TYPE_COMPLEX; +typedef hipfftDoubleReal HIP_DATA_TYPE_REAL; +#define fftExecDir hipfftExecD2Z +#define fftExecInv hipfftExecZ2D +#endif + +__global__ void debug(int varId, int N, HIP_DATA_TYPE_COMPLEX *x) { + for (int i = 0; i < N; i++) + { + HIP_DATA_TYPE_COMPLEX a = x[i]; + double b = (double)a.x; + double c = (double)a.y; + if (varId == 0) printf("GPU: input[%d]=(%2.4f,%2.4f)\n",i+1,b,c); + if (varId == 1) printf("GPU: output[%d]=(%2.4f,%2.4f)\n",i+1,b,c); + } +} + +__global__ void debugFloat(int varId, int N, HIP_DATA_TYPE_REAL *x) { + for (int i = 0; i < N; i++) + { + double a = (double)x[i]; + if (varId == 0) printf("GPU: input[%d]=%2.4f\n",i+1,a); + if (varId == 1) printf("GPU: output[%d]=%2.4f\n",i+1,a); + } +} + +extern "C" +void +hicfft_execute_plan_(int ISIGNp, int N, DATA_TYPE *data_in_host, DATA_TYPE *data_out_host, long *iplan) +{ + HIP_DATA_TYPE_COMPLEX *data_in = reinterpret_cast(data_in_host); + HIP_DATA_TYPE_COMPLEX *data_out = reinterpret_cast(data_out_host); + hipfftHandle* PLANp = reinterpret_cast(iplan); + hipfftHandle plan = *PLANp; + int ISIGN = ISIGNp; + + /*if (hipDeviceSynchronize() != hipSuccess){ + fprintf(stderr, "GPU runtime error: Failed to synchronize\n"); + return; + }*/ + + if( ISIGN== -1 ){ + fftSafeCall(fftExecDir(plan, (HIP_DATA_TYPE_REAL*)data_in, data_out)); + } + else if( ISIGN== 1){ + fftSafeCall(fftExecInv(plan, data_in, (HIP_DATA_TYPE_REAL*)data_out)); + } + else { + abort(); + } + + if (hipDeviceSynchronize() != hipSuccess){ + fprintf(stderr, "GPU runtime error: Failed to synchronize\n"); + return; + } + +} + +namespace { +struct Double { + using real = double; + using cmplx = hipfftDoubleComplex; +}; +struct Float { + using real = float; + using cmplx = hipfftComplex; +}; + +// kfield -> handles +template auto &get_fft_plan_cache() { + static std::unordered_map> fftPlansCache; + return fftPlansCache; +} +// kfield -> graphs +template auto &get_graph_cache() { + static std::unordered_map graphCache; + return graphCache; +} +// kfield -> ptrs +template auto &get_ptr_cache() { + using real = typename Type::real; + using cmplx = typename Type::cmplx; + static std::unordered_map> ptrCache; + return ptrCache; +} + +template +void free_fft_cache(float *, size_t) { + get_graph_cache().clear(); + get_ptr_cache().clear(); +} + + + +template +void execute_fft_new(typename Type::real *data_real, typename Type::cmplx *data_complex, + int kfield, int *loens, int *offsets, int nfft, void *growing_allocator) { + + growing_allocator_register_free_c(growing_allocator, + free_fft_cache); + + constexpr bool is_forward = Direction == HIPFFT_R2C || Direction == HIPFFT_D2Z; + using real = typename Type::real; + using cmplx = typename Type::cmplx; + + /* static std::unordered_map allocationCache; // nloens -> ptr */ +//* static std::unordered_map> fftPlansCache; // kfield -> handles +//* static std::unordered_map graphCache; // kfield -> graphs + + // if the pointers are changed, we need to update the graph +//* static std::unordered_map> ptrCache; // kfield -> ptrs + auto &ptrCache = get_ptr_cache(); // kfield -> ptrs + auto &graphCache = get_graph_cache(); // kfield -> graphs + + auto ptrs = ptrCache.find(kfield); + if (ptrs != ptrCache.end() && ( + ptrs->second.first != data_real || ptrs->second.second != data_complex)) { + // the plan is cached, but the pointers are not correct. we remove and delete the graph, + // but we keep the FFT plans, if this happens more often, we should cache this... + std::cout << "WARNING FFT: POINTER CHANGE --> THIS MIGHT BE SLOW" + << std::endl; + HIC_CHECK(hipGraphExecDestroy(graphCache[kfield])); + graphCache.erase(kfield); + ptrCache.erase(kfield); + } + +//* auto &fftPlansCache = +//* get_fft_plan_cache(); // kfield -> handles + auto graph = graphCache.find(kfield); + if (graph == graphCache.end()) { + // this graph does not exist yet + + auto &fftPlansCache = + get_fft_plan_cache(); // kfield -> handles + auto fftPlans = fftPlansCache.find(kfield); + if (fftPlans == fftPlansCache.end()) { + // the fft plans do not exist yet + std::vector newPlans; + newPlans.resize(nfft); + for (int i = 0; i < nfft; ++i) { + int nloen = loens[i]; + + hipfftHandle plan; + fftSafeCall(hipfftCreate(&plan)); + int dist = offsets[i+1] - offsets[i]; + int embed[] = {1}; + //fftSafeCall(hipfftPlanMany(&plan, 1, &nloen, embed, 1, dist, embed, + // 1, dist / 2, Direction, kfield)); + fftSafeCall(hipfftPlanMany(&plan, 1, &nloen, embed, 1, is_forward ? dist : dist / 2, embed, + 1, is_forward ? dist / 2 : dist, Direction, kfield)); + newPlans[i] = plan; + } + fftPlansCache.insert({kfield, newPlans}); + } + fftPlans = fftPlansCache.find(kfield); + + // create a temporary stream + hipStream_t stream; + HIC_CHECK(hipStreamCreate(&stream)); + + for (auto &plan : fftPlans->second) // set the streams + fftSafeCall(hipfftSetStream(plan, stream)); + + // now create the graph + hipGraph_t new_graph; + hipGraphCreate(&new_graph, 0); + for (int i = 0; i < nfft; ++i) { + int offset = offsets[i]; + real *data_real_l = &data_real[kfield * offset]; + cmplx *data_complex_l = &data_complex[kfield * offset / 2]; + HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + if constexpr(Direction == HIPFFT_R2C) + fftSafeCall(hipfftExecR2C(fftPlans->second[i], data_real_l, data_complex_l)); + else if constexpr(Direction == HIPFFT_C2R) + fftSafeCall(hipfftExecC2R(fftPlans->second[i], data_complex_l, data_real_l)); + else if constexpr(Direction == HIPFFT_D2Z) + fftSafeCall(hipfftExecD2Z(fftPlans->second[i], data_real_l, data_complex_l)); + else if constexpr(Direction == HIPFFT_Z2D) + fftSafeCall(hipfftExecZ2D(fftPlans->second[i], data_complex_l, data_real_l)); + hipGraph_t my_graph; + HIC_CHECK(hipStreamEndCapture(stream, &my_graph)); + hipGraphNode_t my_node; + HIC_CHECK(hipGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, my_graph)); + } + hipGraphExec_t instance; + HIC_CHECK(hipGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); + HIC_CHECK(hipStreamDestroy(stream)); + HIC_CHECK(hipGraphDestroy(new_graph)); + + graphCache.insert({kfield, instance}); + ptrCache.insert({kfield, std::make_pair(data_real, data_complex)}); + } + + HIC_CHECK(hipGraphLaunch(graphCache.at(kfield), 0)); + HIC_CHECK(hipDeviceSynchronize()); +} +} // namespace + + +extern "C" { +void execute_dir_fft_float(float *data_real, hipfftComplex *data_complex, + int kfield, int *loens, int *offsets, int nfft, + void *growing_allocator) { + execute_fft_new(data_real, data_complex, kfield, loens, offsets, + nfft, growing_allocator); +} +void execute_inv_fft_float(hipfftComplex *data_complex, float *data_real, + int kfield, int *loens, int *offsets, int nfft, + void *growing_allocator) { + execute_fft_new(data_real, data_complex, kfield, loens, offsets, + nfft, growing_allocator); +} +void execute_dir_fft_double(double *data_real, hipfftDoubleComplex *data_complex, + int kfield, int *loens, int *offsets, int nfft, + void *growing_allocator) { + execute_fft_new(data_real, data_complex, kfield, loens, + offsets, nfft, growing_allocator); +} +void execute_inv_fft_double(hipfftDoubleComplex *data_complex, double *data_real, + int kfield, int *loens, int *offsets, int nfft, + void *growing_allocator) { + execute_fft_new(data_real, data_complex, kfield, loens, + offsets, nfft, growing_allocator); +} +} + diff --git a/src/trans/gpu/algor/hicfft_hip.h b/src/trans/gpu/algor/hicfft_hip.h new file mode 100644 index 00000000..3fc176ed --- /dev/null +++ b/src/trans/gpu/algor/hicfft_hip.h @@ -0,0 +1,91 @@ +// (C) Copyright 2000- ECMWF. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + +// Include hip runtime and hipfft headers and provide error enum translation + +#ifndef __HICFFT_HIP_H__ +#define __HICFFT_HIP_H__ + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-W#pragma-messages" +#endif +#include +#include "hipfft.h" +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +inline static const char * _fftGetErrorEnum(hipfftResult error) +{ + switch (error) + { + case HIPFFT_SUCCESS: + return "HIPFFT_SUCCESS"; + + case HIPFFT_INVALID_PLAN: + return "HIPFFT_INVALID_PLAN"; + + case HIPFFT_ALLOC_FAILED: + return "HIPFFT_ALLOC_FAILED"; + + case HIPFFT_INVALID_TYPE: + return "HIPFFT_INVALID_TYPE"; + + case HIPFFT_INVALID_VALUE: + return "HIPFFT_INVALID_VALUE"; + + case HIPFFT_INTERNAL_ERROR: + return "HIPFFT_INTERNAL_ERROR"; + + case HIPFFT_EXEC_FAILED: + return "HIPFFT_EXEC_FAILED"; + + case HIPFFT_SETUP_FAILED: + return "HIPFFT_SETUP_FAILED"; + + case HIPFFT_INVALID_SIZE: + return "HIPFFT_INVALID_SIZE"; + + case HIPFFT_UNALIGNED_DATA: + return "HIPFFT_UNALIGNED_DATA"; + + case HIPFFT_INCOMPLETE_PARAMETER_LIST: + return "HIPFFT_INCOMPLETE_PARAMETER_LIST"; + + case HIPFFT_INVALID_DEVICE: + return "HIPFFT_INVALID_DEVICE"; + + case HIPFFT_PARSE_ERROR: + return "HIPFFT_PARSE_ERROR"; + + case HIPFFT_NO_WORKSPACE: + return "HIPFFT_NO_WORKSPACE"; + + case HIPFFT_NOT_IMPLEMENTED: + return "HIPFFT_NOT_IMPLEMENTED"; + + case HIPFFT_NOT_SUPPORTED: + return "HIPFFT_NOT_SUPPORTED"; + } + + return ""; +} + +#define HIC_CHECK(e) \ + { \ + hipError_t err = (e); \ + if (err != hipSuccess) { \ + fprintf(stderr, "HIP error: %s, line %d, %s: %s\n", __FILE__, __LINE__, \ + #e, hipGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + + +#endif diff --git a/src/trans/gpu/algor/seefmm_mix.F90 b/src/trans/gpu/algor/seefmm_mix.F90 new file mode 100644 index 00000000..5fa12535 --- /dev/null +++ b/src/trans/gpu/algor/seefmm_mix.F90 @@ -0,0 +1,548 @@ +! (C) Copyright 2014- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +module seefmm_mix +!**** *SEEFMM_MIX* - Implementation of Simple Exponential Expansion FMM + +! Purpose. +! -------- +! Implementation of Simple Exponential Expansion FMM + +!** Interface. +! ---------- + +! Method. +! ------- +! Based on Algorithm described in Section 4 of the article +! "An improved fast multipole algorithm for potential fields on the line " + + +! Reference. +! ---------- +! "An improved fast multipole algorithm for potential fields on the line " +! by Norman Yarvin and Vladimir Rohklin, SIAM J. Numer. Anal. Vol. 36,No. 2,629-666. [1] +! +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 2009-06-04 +! ------------------------------------------------------------------ + + +use parkind_ectrans,only : jpim ,JPRBT, jprd +use ecsort_mix +use wts500_mod + +integer(kind=jpim) :: nfmm_lim=200 ! Appr. break-even limit for FMM +integer(kind=jpim),parameter :: nquadEm14=28 ! Quadrature size for eps~=1.e-14 +integer(kind=jpim),parameter :: nquadEm10=20! Quadrature size for eps~=1.e-10 +integer(kind=jpim),parameter :: nquadEm07=14! Quadrature size for eps~=1.e-07 + +type fmm_type +integer(kind=jpim) :: nxy ! Total number of point "nx+ny" +integer(kind=jpim) :: nx ! Number of 'x' points +integer(kind=jpim) :: nquad ! Quadrature N +integer(kind=jpim) :: ncik ! Number of elem. in cik +real(kind=JPRBT) :: rw(56) ! Quadrature weights +real(kind=JPRBT) , pointer :: rdexp(:,:) ! exp(xy(i)-xy(i-1)) +integer(kind=jpim), pointer :: index(:) ! index for sorted xy +integer(kind=jpim), pointer :: nclose(:) ! No of "close" points +real(kind=JPRBT) , pointer :: cik(:) ! Correction term (142 in [1]) + +end type fmm_type + +contains +recursive subroutine setup_seefmm(kx,px,ky,py,ydfmm,pdiff) + +implicit none + +!**** *SETUP_SEEFMM* - Setup seefmm + +! Purpose - Pre-computations for applying SEEFMM + +! Explicit arguments : +! -------------------- +! kx - Number of x points +! px - x points +! ky - Number of y points +! py - y points +! ydfmm - result of pre-computations +! pdiff - difference matrix (optional) + +integer(kind=jpim),intent(in) :: kx +real(kind=jprd) ,intent(in) :: px(:) +integer(kind=jpim),intent(in) :: ky +real(kind=JPRBT) ,intent(in) :: py(:) +type(fmm_type) ,intent(out) :: ydfmm +real(kind=JPRBT),optional,intent(in) :: pdiff(:,:) + +real(kind=JPRBT) :: zxy(kx+ky),zrt(56),zcik((kx+ky)*(kx+ky)) +real(kind=JPRBT) :: zr +integer(kind=jpim) :: ixy +!--------------------------------------------------------------------------- +ydfmm%nx=kx +ixy=kx+ky +ydfmm%nxy=ixy +allocate(ydfmm%index(ixy)) +!ydfmm%nquad=nquadEm14 !Set precicion to 1.E-14 +ydfmm%nquad=nquadEm07 !Set precicion to 1.E-07 +! Combine px and py to form xxy, compute ascending index for xxy +call comb_xy(kx,px,ky,py,ixy,zxy,ydfmm%index) +! Setup quadrature, scale (see 3.1.1 in [1]) +call suquad(ixy,zxy(ydfmm%index(1))-zxy(ydfmm%index(ixy)),& + & ydfmm%nquad,ydfmm%rw,zrt,zr) +allocate(ydfmm%rdexp(ydfmm%nquad,ixy)) +allocate(ydfmm%nclose(ixy)) +! Main pre-computation +call prepotf(kx,ixy,ydfmm%nquad,ydfmm%rw,zrt,zr,zxy,ydfmm%index,& + & ydfmm%rdexp,ydfmm%nclose,zcik,ydfmm%ncik,pdiff) +! Needed as size of cik unknown beforehand +allocate(ydfmm%cik(ydfmm%ncik)) +ydfmm%cik(:)=zcik(1:ydfmm%ncik) + +end subroutine setup_seefmm +!========================================================================== + +subroutine free_seefmm(ydfmm) +implicit none + +!**** *FREE_SEEFMM* - Release memory + +! Purpose - Release memory used by ydfmm + +! Explicit arguments : +! -------------------- +! ydfmm - result of pre-computations +type(fmm_type) ,intent(inout) :: ydfmm + +deallocate(ydfmm%index) +deallocate(ydfmm%rdexp) +deallocate(ydfmm%nclose) +deallocate(ydfmm%cik) + +end subroutine free_seefmm + +!========================================================================== +recursive subroutine potf(kn,kx,ldxout,kquad,prw,pq,prdexp,kindex,kclosel,kcik,pcik,ptheta) +implicit none + +integer(kind=jpim),intent(in) :: kn +integer(kind=jpim),intent(in) :: kx +logical ,intent(in) :: ldxout +integer(kind=jpim),intent(in) :: kquad +real(kind=JPRBT) ,intent(in) :: prw(:) +real(kind=JPRBT) ,intent(in) :: pq(:) +real(kind=JPRBT) ,intent(in) :: prdexp(:,:) +integer(kind=jpim),intent(in) :: kindex(:) +integer(kind=jpim),intent(in) :: kclosel(:) +integer(kind=jpim),intent(in) :: kcik +real(kind=JPRBT) ,intent(in) :: pcik(:) +real(kind=JPRBT) ,intent(out) :: ptheta(:) + +real(kind=JPRBT) :: zalpha(kquad),zq(kn),ztheta(kn) +integer(kind=jpim) :: j1,j2,jm,inumc,idist,iquad +integer(kind=jpim) :: iout,iq,i1,i1p1,i1pd,ik1,ix,iy +logical :: lxy,llxy(kn) + +lxy(ik1) = (ik1 <= kx .eqv. ldxout) +!------------------------------------------------------------------------- + +ztheta(:)=0.0_JPRBT +if(ldxout) then + ix=0 + iy=-kx +else + ix=-kx + iy=0 +endif + +do j1=1,kn + i1=kindex(j1) + llxy(j1)=lxy(i1) + if(llxy(j1)) then + zq(j1)=pq(kindex(j1)+ix) + else + zq(j1)=0.0_JPRBT + endif +enddo + +zalpha(:)=zq(1) +do j1=2,kn + if(llxy(j1)) then + do jm=1,kquad + zalpha(jm)=zalpha(jm)*prdexp(jm,j1)+zq(j1) + enddo + else + do jm=1,kquad + zalpha(jm)=zalpha(jm)*prdexp(jm,j1) + ztheta(j1)=ztheta(j1)+prw(jm)*zalpha(jm) + enddo + endif +enddo + +zalpha(1:kquad)=zq(kn) +do j1=kn-1,1,-1 + if(llxy(j1)) then + do jm=1,kquad + zalpha(jm)=zalpha(jm)*prdexp(jm,j1+1)+zq(j1) + enddo + else + do jm=1,kquad + zalpha(jm)=zalpha(jm)*prdexp(jm,j1+1) + ztheta(j1)=ztheta(j1)-prw(jm)*zalpha(jm) + enddo + endif +enddo + + +IF(kcik > 0) then + inumc=0 + do j1=1,kn-1 + do j2=1,kclosel(j1) + idist=j2 + if(.not.llxy(j1) .and. llxy(j1+idist)) then + inumc=inumc+1 + ztheta(j1)=ztheta(j1)-pcik(inumc)*zq(j1+idist) + elseif(llxy(j1) .and. .not.llxy(j1+idist)) then + inumc=inumc+1 + ztheta(j1+idist)=ztheta(j1+idist)+pcik(inumc)*zq(j1) + endif + enddo + enddo +endif + +do j1=1,kn + if(.not. llxy(j1)) then + i1=kindex(j1) + ptheta(i1+iy)=ztheta(j1) + endif +enddo + +end subroutine potf +!========================================================================== +recursive subroutine seefmm_mulv(ydfmm,ldxout,pq,ptheta) +implicit none + +type(fmm_type) ,intent(in) :: ydfmm +logical ,intent(in) :: ldxout +real(kind=JPRBT) ,intent(in) :: pq(:) +real(kind=JPRBT) ,intent(out) :: ptheta(:) + +!------------------------------------------------------------------------- +call potf(ydfmm%nxy,ydfmm%nx,ldxout,ydfmm%nquad,& + & ydfmm%rw,pq,ydfmm%rdexp,ydfmm%index,& + & ydfmm%nclose,ydfmm%ncik,ydfmm%cik,ptheta) + +end subroutine seefmm_mulv +!========================================================================== +recursive subroutine seefmm_mulm(ydfmm,km,kskip,ldxout,pq,ptheta) +implicit none + +type(fmm_type) ,intent(in) :: ydfmm +integer(kind=jpim),intent(in) :: km +integer(kind=jpim),intent(in) :: kskip +logical ,intent(in) :: ldxout +real(kind=JPRBT) ,intent(in) :: pq(:,:) +real(kind=JPRBT) ,intent(out) :: ptheta(:,:) + +!------------------------------------------------------------------------- +call potfm(ydfmm%nxy,km,kskip,ydfmm%nx,ldxout,ydfmm%nquad,& + & ydfmm%rw,pq,ydfmm%rdexp,ydfmm%index,& + & ydfmm%nclose,ydfmm%ncik,ydfmm%cik,ptheta) +end subroutine seefmm_mulm +!========================================================================== + +recursive subroutine potfm(kn,km,kskip,kx,ldxout,kquad,prw,pq,prdexp,kindex,kclosel,kcik,pcik,ptheta) +implicit none + +integer(kind=jpim),intent(in) :: kn +integer(kind=jpim),intent(in) :: km +integer(kind=jpim),intent(in) :: kskip +integer(kind=jpim),intent(in) :: kx +logical ,intent(in) :: ldxout +integer(kind=jpim),intent(in) :: kquad +real(kind=JPRBT) ,intent(in) :: prw(:) +real(kind=JPRBT) ,intent(in) :: pq(:,:) +real(kind=JPRBT) ,intent(in) :: prdexp(:,:) +integer(kind=jpim),intent(in) :: kindex(:) +integer(kind=jpim),intent(in) :: kclosel(:) +integer(kind=jpim),intent(in) :: kcik +real(kind=JPRBT) ,intent(in) :: pcik(:) +real(kind=JPRBT) ,intent(out) :: ptheta(:,:) + +real(kind=JPRBT) :: zalpha(kquad,km) +integer(kind=jpim) :: j1,j2,jm,jq,inumc,idist,iquad +integer(kind=jpim) :: iout,iq,i1,i1p1,i1pd,ik1,ix,iy +logical :: lxy,llxy(kn) + +lxy(ik1) = (ik1 <= kx .eqv. ldxout) +!------------------------------------------------------------------------- + +!CALL GSTATS(209,0) +ptheta(:,:)=0.0_JPRBT +if(ldxout) then + ix=0 + iy=-kx +else + ix=-kx + iy=0 +endif +do j1=1,kn + i1=kindex(j1) + llxy(j1)=lxy(i1) +enddo + +if(llxy(1)) then + do jm=1,km,kskip + zalpha(:,jm)=pq(jm,kindex(1)+ix) + enddo +else + zalpha(:,:)=0.0_JPRBT +endif +!CALL GSTATS(209,1) +!CALL GSTATS(210,0) +do j1=2,kn + i1=kindex(j1) + if(llxy(j1) ) then + if( kskip==1 )then + do jq=1,kquad + do jm=1,km + zalpha(jq,jm)=zalpha(jq,jm)*prdexp(jq,j1) + zalpha(jq,jm)=zalpha(jq,jm)+pq(jm,i1+ix) + enddo + enddo + else + do jq=1,kquad + do jm=1,km,kskip + zalpha(jq,jm)=zalpha(jq,jm)*prdexp(jq,j1) + zalpha(jq,jm)=zalpha(jq,jm)+pq(jm,i1+ix) + enddo + enddo + endif + else + if( kskip==1 )then + do jq=1,kquad + do jm=1,km + zalpha(jq,jm)=zalpha(jq,jm)*prdexp(jq,j1) + ptheta(jm,i1+iy)=ptheta(jm,i1+iy)+prw(jq)*zalpha(jq,jm) + enddo + enddo + else + do jq=1,kquad + do jm=1,km,kskip + zalpha(jq,jm)=zalpha(jq,jm)*prdexp(jq,j1) + ptheta(jm,i1+iy)=ptheta(jm,i1+iy)+prw(jq)*zalpha(jq,jm) + enddo + enddo + endif + endif +enddo +!CALL GSTATS(210,1) + +!CALL GSTATS(211,0) +if(llxy(kn)) then + do jm=1,km,kskip + zalpha(:,jm)=pq(jm,kindex(kn)+ix) + enddo +else + zalpha(:,:)=0.0 +endif +!CALL GSTATS(211,1) +!CALL GSTATS(212,0) +do j1=kn-1,1,-1 + i1=kindex(j1) + i1p1=kindex(j1+1) + if(llxy(j1)) then + if( kskip==1 )then + do jq=1,kquad + do jm=1,km + zalpha(jq,jm)=zalpha(jq,jm)*prdexp(jq,j1+1) + zalpha(jq,jm)=zalpha(jq,jm)+pq(jm,i1+ix) + enddo + enddo + else + do jq=1,kquad + do jm=1,km,kskip + zalpha(jq,jm)=zalpha(jq,jm)*prdexp(jq,j1+1) + zalpha(jq,jm)=zalpha(jq,jm)+pq(jm,i1+ix) + enddo + enddo + endif + else + if( kskip==1 )then + do jq=1,kquad + do jm=1,km + zalpha(jq,jm)=zalpha(jq,jm)*prdexp(jq,j1+1) + ptheta(jm,i1+iy)=ptheta(jm,i1+iy)-prw(jq)*zalpha(jq,jm) + enddo + enddo + else + do jq=1,kquad + do jm=1,km,kskip + zalpha(jq,jm)=zalpha(jq,jm)*prdexp(jq,j1+1) + ptheta(jm,i1+iy)=ptheta(jm,i1+iy)-prw(jq)*zalpha(jq,jm) + enddo + enddo + endif + endif +enddo +!CALL GSTATS(212,1) + + +IF(kcik > 0) then +! CALL GSTATS(213,0) + inumc=0 + do j1=1,kn-1 + do j2=1,kclosel(j1) + idist=j2 + i1=kindex(j1) + i1pd=kindex(j1+idist) + if(.not.llxy(j1) .and. llxy(j1+idist)) then + inumc=inumc+1 + do jm=1,km,kskip + ptheta(jm,i1+iy)=ptheta(jm,i1+iy)-pcik(inumc)*pq(jm,i1pd+ix) + enddo + elseif(llxy(j1) .and. .not.llxy(j1+idist)) then + inumc=inumc+1 + do jm=1,km,kskip + ptheta(jm,i1pd+iy)=ptheta(jm,i1pd+iy)+pcik(inumc)*pq(jm,i1+ix) + enddo + endif + enddo + enddo +! CALL GSTATS(213,1) +endif + +end subroutine potfm +!========================================================================= +recursive subroutine suquad(kn,prange,kquad,prw,prt,pr) +implicit none + +integer(kind=jpim) ,intent(in) :: kn +real(kind=JPRBT),intent(in) :: prange +integer(kind=jpim) ,intent(in) :: kquad +real(kind=JPRBT),intent(out) :: prw(:) +real(kind=JPRBT),intent(out) :: prt(:) +real(kind=JPRBT),intent(out) :: pr + +real(kind=JPRBT) :: za,zb,zs +integer(kind=jpim) :: jm +!------------------------------------------------------------------------- + +za=1.0 +zb=500.0 +zs=zb/prange +pr=za/zs +call wts500(prt,prw,kquad) +do jm=1,kquad + prw(jm)=prw(jm)*zs + prt(jm)=prt(jm)*zs +enddo +end subroutine suquad +!========================================================================== + +recursive subroutine comb_xy(kx,px,ky,py,kxy,pxy,kindex) + +implicit none + +integer(kind=jpim), intent(in) :: kx,ky +real(kind=jprd), intent(in) :: px(:) +real(kind=JPRBT), intent(in) :: py(:) +integer(kind=jpim), intent(in) :: kxy +real(kind=JPRBT), intent(out) :: pxy(:) +integer(kind=jpim), intent(out) :: kindex(:) +integer(kind=jpim) :: jxy,ix,iy,iret + +!------------------------------------------------------------------------- + +pxy(1:kx)=px(1:kx) +pxy(kx+1:kx+ky)=py(1:ky) +!call m01daf(pxy,1,kxy,'D',irank,ifail) +call keysort(iret,pxy,kxy,descending=.true.,index=kindex,init=.true.) +!!$do jxy=1,kxy +!!$ kindex(irank(jxy))=jxy +!!$enddo + +end subroutine comb_xy +!========================================================================== +recursive subroutine prepotf(kx,kxy,kquad,prw,prt,pr,pxy,kindex,prdexp,& + & kclosel,pcik,knocik,pdiff) + +implicit none + +integer(kind=jpim), intent(in) :: kx +integer(kind=jpim), intent(in) :: kxy +integer(kind=jpim), intent(in) :: kquad +real(kind=JPRBT), intent(in) :: pxy(:) +real(kind=JPRBT), intent(in) :: prw(:) +real(kind=JPRBT), intent(in) :: pr +real(kind=JPRBT), intent(in) :: prt(:) +integer(kind=jpim), intent(in) :: kindex(:) +real(kind=JPRBT), intent(out) :: prdexp(:,:) +integer(kind=jpim), intent(out) :: kclosel(:) +real(kind=JPRBT), intent(out) :: pcik(:) +integer(kind=jpim), intent(out) :: knocik +real(kind=JPRBT),optional, intent(in) :: pdiff(:,:) + +real(kind=JPRBT) :: zdx +real(kind=JPRBT) :: zsum +real(kind=JPRBT) :: zdiff(kxy,kxy) +integer(kind=jpim) :: jxy,jq,isize,jdist,ixy,ixym1,i1,i1pd,j1,j2 +logical :: llexit +!------------------------------------------------------------------------- +if(present(pdiff)) then + zdiff(:,:)=pdiff(:,:) +else + do j1=1,kxy + do j2=1,kxy + zdiff(j1,j2)=pxy(j1)-pxy(j2) + enddo + enddo +endif +do jxy=2,kxy + ixy=kindex(jxy) + ixym1=kindex(jxy-1) + do jq=1,kquad + prdexp(jq,jxy)=exp(zdiff(ixy,ixym1)*prt(jq)) + enddo +enddo +kclosel(:)=0 +knocik=0 +isize=size(pcik) +llexit=.true. +do jxy=1,kxy-1 + do jdist=1,kxy-jxy + i1=kindex(jxy) + i1pd=kindex(jxy+jdist) + zdx=zdiff(i1,i1pd) + if(zdx < pr) then + llexit=.false. + kclosel(jxy)=kclosel(jxy)+1 + if((i1 > kx .and. i1pd <= kx) .or. (i1pd > kx .and. i1 <= kx)) then + knocik=knocik+1 + zsum=0.0_JPRBT + do jq=1,kquad + zsum=zsum+prw(jq)*exp(-zdx*prt(jq)) + enddo + pcik(knocik)=1.0_JPRBT/zdx-zsum + endif + else + exit + endif + enddo + if(knocik > isize) stop ' precompfint : pcik tto small' +enddo + +end subroutine prepotf +!========================================================================== + +end module seefmm_mix diff --git a/src/trans/gpu/algor/wts500_mod.F90 b/src/trans/gpu/algor/wts500_mod.F90 new file mode 100644 index 00000000..0859d78e --- /dev/null +++ b/src/trans/gpu/algor/wts500_mod.F90 @@ -0,0 +1,3764 @@ +! (C) Copyright 2014- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE WTS500_MOD +CONTAINS +SUBROUTINE WTS500(PX,PW,KN) + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT +IMPLICIT NONE + +INTEGER(KIND=JPIM), INTENT(IN) :: KN +REAL(KIND=JPRBT), INTENT(OUT) :: PX(:),PW(:) + +! This routine returns a set of Gaussian nodes and weights for +! integrating the functions exp(lambda*x)dx over the range x=0 to x=infinity. +! They work for lambda in the range [1,501]. The accuracy +! of the quadrature for each n is given in the tables below. + +! Input arguments: +! n - number of weights and nodes in the quadrature. This must +! be an integer in the range [2,56]. +! +! Output arguments: +! w - weights +! x - nodes +! +! +! The following table gives the approximate accuracy of the weights in +! this file, that is to say the experimentally determined maximum +! absolute error for lambda in the range [1,501]. +! +! 2 0.76126E-01 +! 3 0.26903E-01 +! 4 0.88758E-02 +! 5 0.28110E-02 +! 6 0.86785E-03 +! 7 0.26276E-03 +! 8 0.78346E-04 +! 9 0.23066E-04 +! 10 0.67184E-05 +! 11 0.19386E-05 +! 12 0.55482E-06 +! 13 0.15762E-06 +! 14 0.44478E-07 +! 15 0.12474E-07 +! 16 0.34787E-08 +! 17 0.96498E-09 +! 18 0.26636E-09 +! 19 0.73174E-10 +! 20 0.20013E-10 +! 21 0.54503E-11 +! 22 0.14783E-11 +! 23 0.39937E-12 +! 24 0.10749E-12 +! 25 0.28822E-13 +! 26 0.77011E-14 +! 27 0.20993E-14 +! 28 0.59593E-15 + +! (The accuracies beyond this point are +! only available if this routine is converted +! to extended precision.) +! +! 29 0.16665E-15 +! 30 0.45938E-16 +! 31 0.12483E-16 +! 32 0.33436E-17 +! 33 0.88209E-18 +! 34 0.22896E-18 +! 35 0.58363E-19 +! 36 0.15182E-19 +! 37 0.45892E-20 +! 38 0.13452E-20 +! 39 0.38384E-21 +! 40 0.10683E-21 +! 41 0.29025E-22 +! 42 0.76955E-23 +! 43 0.19878E-23 +! 44 0.49867E-24 +! 45 0.12879E-24 +! 46 0.38890E-25 +! 47 0.11493E-25 +! 48 0.32717E-26 +! 49 0.89977E-27 +! 50 0.23916E-27 +! 51 0.66534E-28 +! 52 0.20256E-28 +! 53 0.60754E-29 +! 54 0.17974E-29 +! 55 0.52173E-30 +! 56 0.14656E-30 +! 57 0.39867E-31 +! 58 0.17622E-31 +! 59 0.11941E-31 + + +if(kn < 1 .or. kn > 59) CALL ABOR1('kn out of bounds in wts500') + +if(kn == 1) then + px( 1)= 0.30029234138173323099658823269124393D+00 + pw( 1)= 0.10474544159373900054024730385996879D+01 +endif +if(kn == 2) then + px( 1)= 0.44614645646035084305052271657195780D-01 + px( 2)= 0.69921614559509068409005059560416520D+00 + pw( 1)= 0.15994862626671497398269903651565162D+00 + pw( 2)= 0.15511944041990193294522419186360192D+01 +endif +if(kn == 3) then + px( 1)= 0.11857389353662594950547350532174902D-01 + px( 2)= 0.16764835416208964726306668598724940D+00 + px( 3)= 0.11277491807394385305149473243132366D+01 + pw( 1)= 0.40534466810113107834140226328650886D-01 + pw( 2)= 0.36261372044374320167772351965467234D+00 + pw( 3)= 0.19347454032003660753035080254268649D+01 +endif +if(kn == 4) then + px( 1)= 0.57654208655188821571537226422468374D-02 + px( 2)= 0.62523167781181198280172417136126064D-01 + px( 3)= 0.36533207087496350173593536437526512D+00 + px( 4)= 0.16157524591822212021884702173778156D+01 + pw( 1)= 0.17868545000488806425866630797084082D-01 + pw( 2)= 0.12328668633600752343455555544669733D+00 + pw( 3)= 0.58372673164630844216911861958288950D+00 + pw( 4)= 0.22766170591050845697618451516268360D+01 +endif +if(kn == 5) then + px( 1)= 0.38286655100887720869535305553503767D-02 + px( 2)= 0.32638654131646742439488396483219523D-01 + px( 3)= 0.15979343932440245421190120326583297D+00 + px( 4)= 0.62593598592475461862138219614406608D+00 + px( 5)= 0.21448850159686373839369069493121931D+01 + pw( 1)= 0.11006282598632254556602552524231582D-01 + pw( 2)= 0.57230982733825644334586234468135637D-01 + pw( 3)= 0.23302193704059872350599634911774233D+00 + pw( 4)= 0.80399187922220639155300759119683454D+00 + pw( 5)= 0.25817835060095957471682462945418890D+01 +endif +if(kn == 6) then + px( 1)= 0.29149725976167556773168063131622477D-02 + px( 2)= 0.21103262424016434877304140537976324D-01 + px( 3)= 0.87672665103786085283260733993906787D-01 + px( 4)= 0.30362191434277672991331660483079722D+00 + px( 5)= 0.93772122523975475283401768866415608D+00 + px( 6)= 0.27034204049910086432949276908848690D+01 + pw( 1)= 0.80264082196721958922253358145693741D-02 + pw( 2)= 0.33206285080799577093824435206894388D-01 + pw( 3)= 0.11527314673621704717905199327206330D+00 + pw( 4)= 0.35781407343251622864391524621893261D+00 + pw( 5)= 0.10155006971456147012687855907386742D+01 + pw( 6)= 0.28564602273790248848620772701419901D+01 +endif +if(kn == 7) then + px( 1)= 0.23756434216797693908355583003051663D-02 + px( 2)= 0.15535118461800569190547935165674497D-01 + px( 3)= 0.56551273947723240042834250291153011D-01 + px( 4)= 0.17450996559452745580564075650691432D+00 + px( 5)= 0.49112500667928528178358415821262749D+00 + px( 6)= 0.12919504684055881399873474932878924D+01 + px( 7)= 0.32851412356973227675206901468229149D+01 + pw( 1)= 0.63886696350675163666711135442919764D-02 + pw( 2)= 0.22442242541224196499797804610442189D-01 + pw( 3)= 0.67162457789171790971598253749543852D-01 + pw( 4)= 0.18766490984256656291382113580254403D+00 + pw( 5)= 0.48993342911988760795594155472540210D+00 + pw( 6)= 0.12160105557427987424854903052204337D+01 + pw( 7)= 0.31065668529411483170389922159304207D+01 +endif +if(kn == 8) then + px( 1)= 0.20132180607225834852983700686031058D-02 + px( 2)= 0.12364145427770238507453563351438370D-01 + px( 3)= 0.40779487004322342225816753965428875D-01 + px( 4)= 0.11391148589791378330792648499651137D+00 + px( 5)= 0.29412883145944878287611616309747638D+00 + px( 6)= 0.71837098644914450185415022533227351D+00 + px( 7)= 0.16816853747148161279323645594606599D+01 + px( 8)= 0.38855951611162649511701947653116899D+01 + pw( 1)= 0.53408830450397378295721475271713427D-02 + pw( 2)= 0.16787984180357441921393643943808306D-01 + pw( 3)= 0.44204970292771876400860637563463140D-01 + pw( 4)= 0.11185011190895920238718302559774708D+00 + pw( 5)= 0.27016191944753418682586773667091217D+00 + pw( 6)= 0.62450892495416087531923817566640810D+00 + pw( 7)= 0.14051940584215133597891405916743820D+01 + pw( 8)= 0.33364764999199772405176908767598180D+01 +endif +if(kn == 9) then + px( 1)= 0.17503557878075214519879036524833375D-02 + px( 2)= 0.10330054871723184754037951908660005D-01 + px( 3)= 0.31732206325445549848937295392421049D-01 + px( 4)= 0.81679121803401719155156637120993002D-01 + px( 5)= 0.19535573613157974971806765507983509D+00 + px( 6)= 0.44595223788074039354969352640100852D+00 + px( 7)= 0.98135220945386835905397405725519927D+00 + px( 8)= 0.21015170770020791654126380210416672D+01 + px( 9)= 0.45016099625796459085057556421321067D+01 + pw( 1)= 0.46046107862322611600702922635137510D-02 + pw( 2)= 0.13429769279555420282230890098322098D-01 + pw( 3)= 0.31849272894061216190385847667286999D-01 + pw( 4)= 0.73619249380405585299506449662883408D-01 + pw( 5)= 0.16535947795274881422091783969932427D+00 + pw( 6)= 0.35934417872243713158616629815331565D+00 + pw( 7)= 0.75859660458155596407004967675485659D+00 + pw( 8)= 0.15835611580085742332645675327785911D+01 + pw( 9)= 0.35495334121390922353050388191171761D+01 +endif +if(kn == 10) then + px( 1)= 0.15499542669794147311761132610985078D-02 + px( 2)= 0.89096688398509678030924958952986541D-02 + px( 3)= 0.26016129693043413683698040760210971D-01 + px( 4)= 0.62749208598031754292711006026615926D-01 + px( 5)= 0.14050353407849426047312501552279980D+00 + px( 6)= 0.30181414123622395598472969521213967D+00 + px( 7)= 0.62857185766887058395879604119303521D+00 + px( 8)= 0.12763299399934852402803633611287669D+01 + px( 9)= 0.25471652578226896546390447678170800D+01 + px(10)= 0.51308067782658204682801321291953140D+01 + pw( 1)= 0.40548253986991389304086648382669812D-02 + pw( 2)= 0.11236188731318700188319947717229244D-01 + pw( 3)= 0.24538592970816425402207400802497825D-01 + pw( 4)= 0.52343517158460444258926972420783323D-01 + pw( 5)= 0.10996558531241373841512937168184451D+00 + pw( 6)= 0.22576984249893556435278829920283551D+00 + pw( 7)= 0.45261772769497660240184928082408646D+00 + pw( 8)= 0.89046194530658792052229355508350808D+00 + pw( 9)= 0.17519181692872931385803490555209237D+01 + pw(10)= 0.37483184442175079452559119390314762D+01 +endif +if(kn == 11) then + px( 1)= 0.13916503869954620980214122938112018D-02 + px( 2)= 0.78558050039620362116422912802704097D-02 + px( 3)= 0.22120351963459063625179184464273817D-01 + px( 4)= 0.50713242013627782652925282235350304D-01 + px( 5)= 0.10742319887758113859279057980251631D+00 + px( 6)= 0.21869117548117680828718663672766937D+00 + px( 7)= 0.43336030838054496927092430378951163D+00 + px( 8)= 0.84018288532692476091948497009100848D+00 + px( 9)= 0.15999462842086233335871841089805130D+01 + px(10)= 0.30151891304900922172275251687511795D+01 + px(11)= 0.57713337136325357659044166408988553D+01 + pw( 1)= 0.36266510989460802397873164288690734D-02 + pw( 2)= 0.96950369134995745371692684658412310D-02 + pw( 3)= 0.19868559916434820960232503677049332D-01 + pw( 4)= 0.39531505533000821496522977944503206D-01 + pw( 5)= 0.78139977968766268496958310477317134D-01 + pw( 6)= 0.15225813478813020210433348166511355D+00 + pw( 7)= 0.29137456345793780649576074277273596D+00 + pw( 8)= 0.54807495314077461646456170049418256D+00 + pw( 9)= 0.10191215020827390969142637205874466D+01 + pw(10)= 0.19111370110068844260874568369820828D+01 + pw(11)= 0.39348541670555911675625708842098543D+01 +endif +if(kn == 12) then + px( 1)= 0.12632156319939695579215561432558385D-02 + px( 2)= 0.70387146180879286642550585568639377D-02 + px( 3)= 0.19302659916972801592515201957656444D-01 + px( 4)= 0.42551194105513329489354627472007290D-01 + px( 5)= 0.86085992365439625679560348707191417D-01 + px( 6)= 0.16725813918676372888922850902058409D+00 + px( 7)= 0.31700589361943171070304767229555238D+00 + px( 8)= 0.58951191619791589683106843254408061D+00 + px( 9)= 0.10788413480839543106858840025589018D+01 + px(10)= 0.19492439011651360999619060490611511D+01 + px(11)= 0.35027911449093405858739078699931569D+01 + px(12)= 0.64217203451802643126959421992550844D+01 + pw( 1)= 0.32827550127604954815335522044868924D-02 + pw( 2)= 0.85507540197432347733759971108613029D-02 + pw( 3)= 0.16691067543209908975186757261638663D-01 + pw( 4)= 0.31305443669027793014029811945580099D-01 + pw( 5)= 0.58567896813505682343320309424861089D-01 + pw( 6)= 0.10877780693120356209434163632243069D+00 + pw( 7)= 0.19948368504455875259042629374456601D+00 + pw( 8)= 0.36073967408442558747147434040847228D+00 + pw( 9)= 0.64434179114194185133722660716595021D+00 + pw(10)= 0.11440591869200469793282529105709381D+01 + pw(11)= 0.20620613323447639907885178297356148D+01 + pw(12)= 0.41107547758236817466309181956604953D+01 +endif +if(kn == 13) then + px( 1)= 0.11568070520917003084172899381406952D-02 + px( 2)= 0.63841031288591798815234066736007959D-02 + px( 3)= 0.17167711063873432612672836118321923D-01 + px( 4)= 0.36716019315047555758945584779212588D-01 + px( 5)= 0.71538434902470635056036514447143755D-01 + px( 6)= 0.13352689214829569833793677431883262D+00 + px( 7)= 0.24327491638178311179091055382048232D+00 + px( 8)= 0.43570388054574854249210127119178039D+00 + px( 9)= 0.76944422789933215499607649746035088D+00 + px(10)= 0.13426076062689412695687622546331410D+01 + px(11)= 0.23216397056754496795639041380757885D+01 + px(12)= 0.40076692644610800412009568941378368D+01 + px(13)= 0.70807767026725092855482684561044825D+01 + pw( 1)= 0.29999461201538405335426373656007144D-02 + pw( 2)= 0.76647148893919466896210631066410433D-02 + pw( 3)= 0.14413099397767107036014918126327008D-01 + pw( 4)= 0.25735254465096214146045248201904623D-01 + pw( 5)= 0.45838911095175772187879197848974379D-01 + pw( 6)= 0.81479609103017647524386799211775902D-01 + pw( 7)= 0.14368582127715272775954689518716653D+00 + pw( 8)= 0.25069362938769053221329919008840415D+00 + pw( 9)= 0.43269827884396605348016529885550754D+00 + pw(10)= 0.74044407474304235266031833325121847D+00 + pw(11)= 0.12650477516195139055095418091901788D+01 + pw(12)= 0.22054684388256755415929800672572855D+01 + pw(13)= 0.42773285665753897495906588616832298D+01 +endif +if(kn == 14) then + px( 1)= 0.10671394835726993643401190254501129D-02 + px( 2)= 0.58463219387999205854721615603871185D-02 + px( 3)= 0.15490001691602291418567227600617406D-01 + px( 4)= 0.32358969391919711170669944447127038D-01 + px( 5)= 0.61149820125492142314645657324953302D-01 + px( 6)= 0.11030358671474348741774495105493271D+00 + px( 7)= 0.19410127886489828338777312891458955D+00 + px( 8)= 0.33608975279087339935499749076365196D+00 + px( 9)= 0.57467416136431217184690034730315880D+00 + px(10)= 0.97213677138209888520957230352753264D+00 + px(11)= 0.16296229550658990298708599801796051D+01 + px(12)= 0.27148835813646440924906285656041458D+01 + px(13)= 0.45279078084008377593720415669623160D+01 + px(14)= 0.77475240501693856896302484250295344D+01 + pw( 1)= 0.27629693325443496023249306681806622D-02 + pw( 2)= 0.69561289060164080762372517703735452D-02 + pw( 3)= 0.12707947219219552524391040926686251D-01 + pw( 4)= 0.21789865957161394780641970526254899D-01 + pw( 5)= 0.37165450147146819315861454426090832D-01 + pw( 6)= 0.63467023298815675308212729382023282D-01 + pw( 7)= 0.10796235746711791223910582849233129D+00 + pw( 8)= 0.18227038265561301471910246064953376D+00 + pw( 9)= 0.30504176678107803021400309104091856D+00 + pw(10)= 0.50632019264974725562617387788020652D+00 + pw(11)= 0.83570187185256971962451509332190849D+00 + pw(12)= 0.13820370228408991881296397708221031D+01 + pw(13)= 0.23420578929692185520173441027903024D+01 + pw(14)= 0.44356505634271121964311698817472400D+01 +endif +if(kn == 15) then + px( 1)= 0.99051088742239546911827900108699496D-03 + px( 2)= 0.53956859621998690434579925957171424D-02 + px( 3)= 0.14133099949775560668841655685250135D-01 + px( 4)= 0.28986898223070841544449976160392634D-01 + px( 5)= 0.53435705913102094881195404152789995D-01 + px( 6)= 0.93645206162765297669714244403422362D-01 + px( 7)= 0.15985934779166946137952895657247365D+00 + px( 8)= 0.26857062154409215737173322640809941D+00 + px( 9)= 0.44599642616020670957476031778753703D+00 + px(10)= 0.73354905454082877060570463263769745D+00 + px(11)= 0.11964718339441153959477756602491571D+01 + px(12)= 0.19381471569679916567453627291281352D+01 + px(13)= 0.31270149297417650854875367303912466D+01 + px(14)= 0.50618963332858424622137075227057629D+01 + px(15)= 0.84211464685237105475663901928632362D+01 + pw( 1)= 0.25613355491269301472247025631698555D-02 + pw( 2)= 0.63749573647622919143443050089843640D-02 + pw( 3)= 0.11385424145275303734569902166128714D-01 + pw( 4)= 0.18885541036270664981042580089520502D-01 + pw( 5)= 0.31017529676997219516938356388920347D-01 + pw( 6)= 0.51073868255802230619626900228351940D-01 + pw( 7)= 0.84046732560351404937544837831163072D-01 + pw( 8)= 0.13765675735555740692176543168654955D+00 + pw( 9)= 0.22395807420699105358370420342497012D+00 + pw(10)= 0.36179555862328041144887460703035761D+00 + pw(11)= 0.58087464283264030346041192108355511D+00 + pw(12)= 0.92964960101952292137644303560667260D+00 + pw(13)= 0.14950837580230651438254975578030658D+01 + pw(14)= 0.24724522131384988410164432454754806D+01 + pw(15)= 0.45866150536692880851084073088444439D+01 +endif +if(kn == 16) then + px( 1)= 0.92424556053971804395203579030996377D-03 + px( 2)= 0.50119857035276309167085959684639920D-02 + px( 3)= 0.13010148344461010136325444162584406D-01 + px( 4)= 0.26298847872231801244580127340702382D-01 + px( 5)= 0.47514597563807100058998705367026609D-01 + px( 6)= 0.81270265524596050675960333331275375D-01 + px( 7)= 0.13512212854897371400148043770796492D+00 + px( 8)= 0.22099831669763609789452208029307415D+00 + px( 9)= 0.35743533198723356545740999174337465D+00 + px(10)= 0.57303934564748771037392604303226621D+00 + px(11)= 0.91178803183594107838418321994075593D+00 + px(12)= 0.14412990877209031034459072051939979D+01 + px(13)= 0.22665732295803028877227882175486339D+01 + px(14)= 0.35563219327028564804732957321586739D+01 + px(15)= 0.56082683107347449605789692452752341D+01 + px(16)= 0.91009557053769985645449386780285663D+01 + pw( 1)= 0.23875730074587121522484947794868875D-02 + pw( 2)= 0.58885836354644078094393546205524519D-02 + pw( 3)= 0.10329316404953667658388475423521547D-01 + pw( 4)= 0.16675469828365142363971545189694061D-01 + pw( 5)= 0.26509288186703737427234519675535818D-01 + pw( 6)= 0.42238486164148838677474311842225533D-01 + pw( 7)= 0.67416914217096425505948847545251978D-01 + pw( 8)= 0.10736855475828194921843388490206803D+00 + pw( 9)= 0.17018723045468088538746817095795499D+00 + pw(10)= 0.26821766647062034548841613907117539D+00 + pw(11)= 0.42033354906191972945041277677914052D+00 + pw(12)= 0.65579398374758854246146234684717478D+00 + pw(13)= 0.10219768938841077866955934226960186D+01 + pw(14)= 0.16043073953579345130225854215735434D+01 + pw(15)= 0.25972025694404090887006269510370113D+01 + pw(16)= 0.47309741605753804978373532621330190D+01 +endif +if(kn == 17) then + px( 1)= 0.86635871969021922697368620380719171D-03 + px( 2)= 0.46809392638055264395521402125297454D-02 + px( 3)= 0.12063365169996780691336553708488918D-01 + px( 4)= 0.24103101327380368111361062755741909D-01 + px( 5)= 0.42839783531484774994275135488102184D-01 + px( 6)= 0.71796215026314425758110076991702409D-01 + px( 7)= 0.11667729790867741765510249018437646D+00 + px( 8)= 0.18634666322578944021297170782701070D+00 + px( 9)= 0.29431002802504656436686613469103506D+00 + px(10)= 0.46097947485749393786168585286436753D+00 + px(11)= 0.71707474488613285211211458276745483D+00 + px(12)= 0.11087399814864640384073502346191564D+01 + px(13)= 0.17054769509601451753469079240400202D+01 + px(14)= 0.26134294141755383267430920003219774D+01 + px(15)= 0.40013054555328633047267069424482020D+01 + px(16)= 0.61658542528138849120122933226961818D+01 + px(17)= 0.97863651496368699835621922821544266D+01 + pw( 1)= 0.22362031087732985183223720565899613D-02 + pw( 2)= 0.54748197495181657019575138084577960D-02 + pw( 3)= 0.94654762542541339323585186529928706D-02 + pw( 4)= 0.14944978294023789536885689508849665D-01 + pw( 5)= 0.23104141337215046610391678093559219D-01 + pw( 6)= 0.35743858431643263735692324511510319D-01 + pw( 7)= 0.55471485911701630593769305151799996D-01 + pw( 8)= 0.86081253422553517325473099895533971D-01 + pw( 9)= 0.13319208512009976788628468120231006D+00 + pw(10)= 0.20518383340535621745997474262468218D+00 + pw(11)= 0.31457026902282458163459435604989060D+00 + pw(12)= 0.48013605826872593355323421318271994D+00 + pw(13)= 0.73064164224806746286555643417201840D+00 + pw(14)= 0.11124853152432080885402590631214998D+01 + pw(15)= 0.17098620915289443753971369096340138D+01 + pw(16)= 0.27167962114974631513284929060677873D+01 + pw(17)= 0.48693666605809874327613580765223186D+01 +endif +if(kn == 18) then + px( 1)= 0.81534546944719247652148851160350705D-03 + px( 2)= 0.43921383032957290852797835084086745D-02 + px( 3)= 0.11252831287113979876218879126262041D-01 + px( 4)= 0.22272809918423842477230910840641111D-01 + px( 5)= 0.39059190621733757694717358109890744D-01 + px( 6)= 0.64351345855435130240707418934923649D-01 + px( 7)= 0.10254178168021558618169365620644254D+00 + px( 8)= 0.16036912239056014178785088963204099D+00 + px( 9)= 0.24793085437822483127871211432445117D+00 + px(10)= 0.38020606680363380296594904808373420D+00 + px(11)= 0.57931198537370336370768123946111904D+00 + px(12)= 0.87782030094596229067122251201903469D+00 + px(13)= 0.13236886505899049407415999004449793D+01 + px(14)= 0.19878981731926587403713993880950166D+01 + px(15)= 0.29773742063276571130424960439239811D+01 + px(16)= 0.44606479473592453417809733684642453D+01 + px(17)= 0.67336454736569532214104241445939923D+01 + px(18)= 0.10476870213785278025831291196419849D+02 + pw( 1)= 0.21031106134613254745224559686561387D-02 + pw( 2)= 0.51180225478690087298803997939678307D-02 + pw( 3)= 0.87447370935970058836913883371899363D-02 + pw( 4)= 0.13556333730223126136121123231259769D-01 + pw( 5)= 0.20464410216338036789779126385561199D-01 + pw( 6)= 0.30840826466947410241261173555971053D-01 + pw( 7)= 0.46646725563835069845910497771424517D-01 + pw( 8)= 0.70665992632893967524146851106623791D-01 + pw( 9)= 0.10691711075454395961181073477729829D+00 + pw(10)= 0.16126547210903485792084905962898804D+00 + pw(11)= 0.24229509616610255685205188073761953D+00 + pw(12)= 0.36259234617456851620630123413473169D+00 + pw(13)= 0.54077318770758097024862643911836327D+00 + pw(14)= 0.80508517291294611337977692389747849D+00 + pw(15)= 0.12010568605667241321654935386099247D+01 + pw(16)= 0.18119190754724382445935517697030275D+01 + pw(17)= 0.28316640971192016310500153967949394D+01 + pw(18)= 0.50023398513295527439214736503709354D+01 +endif +if(kn == 19) then + px( 1)= 0.77004314038027538186473565824644713D-03 + px( 2)= 0.41377999330050882914315571254404182D-02 + px( 3)= 0.10550059721986121680218103475882631D-01 + px( 4)= 0.20721165151890931602860316370210977D-01 + px( 5)= 0.35938464219963308088025295472146865D-01 + px( 6)= 0.58366846240533488939212086566679739D-01 + px( 7)= 0.91445878837421954880767431790624735D-01 + px( 8)= 0.14039874091074934933241224785817280D+00 + px( 9)= 0.21294392165350590964714225395783962D+00 + px(10)= 0.32034449388214261059174416345147381D+00 + px(11)= 0.47894630556295055579310759106626288D+00 + px(12)= 0.71240272881825772752142518345001522D+00 + px(13)= 0.10548945187200170944433178271823931D+01 + px(14)= 0.15558853016268620673915640887496031D+01 + px(15)= 0.22875048294126069831169385958820912D+01 + px(16)= 0.33571879025245227476638520498261293D+01 + px(17)= 0.49331870217088227943005162552018204D+01 + px(18)= 0.73107657529555946558412435928344152D+01 + px(19)= 0.11172033293322081236666941378477642D+02 + pw( 1)= 0.19851404434292128161572603749012797D-02 + pw( 2)= 0.48068259850224485883558544189622861D-02 + pw( 3)= 0.81333622551403917938013382271885652D-02 + pw( 4)= 0.12418290368147926797125266347472059D-01 + pw( 5)= 0.18370520996399132266929646675963512D-01 + pw( 6)= 0.27051260748772011764068846294046263D-01 + pw( 7)= 0.39965808856363914121506612420685851D-01 + pw( 8)= 0.59209186306872358867005896800202963D-01 + pw( 9)= 0.87733163743117132114765866319084195D-01 + pw(10)= 0.12975590467993236932824761605587432D+00 + pw(11)= 0.19133746801777096221136436300603368D+00 + pw(12)= 0.28119521004597734924657796391935552D+00 + pw(13)= 0.41191451657988898367993353735036478D+00 + pw(14)= 0.60189235614006428503841831846326255D+00 + pw(15)= 0.87887424161089233540150542700936886D+00 + pw(16)= 0.12876310724220064351766502884115249D+01 + pw(17)= 0.19106555786053829247748848866287425D+01 + pw(18)= 0.29421880423760456779000496029582960D+01 + pw(19)= 0.51303663807899157605725316747503964D+01 +endif +if(kn == 20) then + px( 1)= 0.72953870581824875898426590899779881D-03 + px( 2)= 0.39119765013576279090594732249736674D-02 + px( 3)= 0.99341482544177994609830844666447236D-02 + px( 4)= 0.19386948343749964006247704258211447D-01 + px( 5)= 0.33316937960159820123580449168857215D-01 + px( 6)= 0.53460240758464289437871006955420897D-01 + px( 7)= 0.82550373662718413944945968267436420D-01 + px( 8)= 0.12470333365766989270462564119778831D+00 + px( 9)= 0.18593248929460522396945432673813683D+00 + px(10)= 0.27488824477709750979845021236505632D+00 + px(11)= 0.40393075219294011341163041535306531D+00 + px(12)= 0.59066371193107531104946391158049526D+00 + px(13)= 0.86011311520208939992657730367961144D+00 + px(14)= 0.12478475018108315971898483004517581D+01 + px(15)= 0.18045718085352853467882826664457563D+01 + px(16)= 0.26032962838705539076929338309800964D+01 + px(17)= 0.37517626775845063477806691210141684D+01 + px(18)= 0.54178931704948062041809621881307184D+01 + px(19)= 0.78964489442873309920085649476831128D+01 + px(20)= 0.11871472067202549660846380067126100D+02 + pw( 1)= 0.18798307004678227911126701301382902D-02 + pw( 2)= 0.45327579066213798867372241143837770D-02 + pw( 3)= 0.76074915984009747299662992379957293D-02 + pw( 4)= 0.11468617772837279819666929117999041D-01 + pw( 5)= 0.16675567273387276975224875991907841D-01 + pw( 6)= 0.24060338209931569028899923305997740D-01 + pw( 7)= 0.34797589593340192429611675508702300D-01 + pw( 8)= 0.50498483539096564292114787807163509D-01 + pw( 9)= 0.73383972617080340693094946803759227D-01 + pw(10)= 0.10656241858153477344295213679085888D+00 + pw(11)= 0.15442089967699606004539206241769081D+00 + pw(12)= 0.22316517325789412150938929616587900D+00 + pw(13)= 0.32158746804288047814456613183637546D+00 + pw(14)= 0.46221802166864090447194287669501892D+00 + pw(15)= 0.66320650916245908698703462902787156D+00 + pw(16)= 0.95182294118921990028099509733606027D+00 + pw(17)= 0.13721884223731472188837264029436159D+01 + pw(18)= 0.20062479833990429444286070644467212D+01 + pw(19)= 0.30487071357319501782576369389563967D+01 + pw(20)= 0.52538573669990067214113994100342659D+01 +endif +if(kn == 21) then + px( 1)= 0.69310486169589490054767779485652272D-03 + px( 2)= 0.37100374076510691098395748526914731D-02 + px( 3)= 0.93893881986074420164580675137225674D-02 + px( 4)= 0.18225818290917406339248365551994452D-01 + px( 5)= 0.31081498488793808069000194256763116D-01 + px( 6)= 0.49367420729419602783748969736699563D-01 + px( 7)= 0.75284896750519291461000850881359128D-01 + px( 8)= 0.11212409497044480168252045403791321D+00 + px( 9)= 0.16464732212808249809258249926002432D+00 + px(10)= 0.23962225781352520103022311097593359D+00 + px(11)= 0.34658518012264822513642471749928831D+00 + px(12)= 0.49892252217405192191396093341852149D+00 + px(13)= 0.71538674271382417280638826801508549D+00 + px(14)= 0.10222215706608203725632212510312547D+01 + px(15)= 0.14561846864798943048112298539958500D+01 + px(16)= 0.20689966860645877444413948503855558D+01 + px(17)= 0.29343325491322992822962883455667422D+01 + px(18)= 0.41600923418527605289681744860608923D+01 + px(19)= 0.59138510177367345119623187111536027D+01 + px(20)= 0.84900211059998571872908583995965399D+01 + px(21)= 0.12574850278864023350809778583936857D+02 + pw( 1)= 0.17852307235103793959959450796716884D-02 + pw( 2)= 0.42893635684453385047417019848134615D-02 + pw( 3)= 0.71497939239830042802199211798008382D-02 + pw( 4)= 0.10663735418026519276123538929713931D-01 + pw( 5)= 0.15278738558420813530874306732239620D-01 + pw( 6)= 0.21655078725685226823011775486802689D-01 + pw( 7)= 0.30722016510142553890174563226245394D-01 + pw( 8)= 0.43741403514742541630405757195910036D-01 + pw( 9)= 0.62420477800170703110133642765679935D-01 + pw(10)= 0.89101011801730193914410815797468243D-01 + pw(11)= 0.12703103910528559102170033988553577D+00 + pw(12)= 0.18073427743399167030424190804804963D+00 + pw(13)= 0.25651888703698665673377349665161239D+00 + pw(14)= 0.36320523524230891449268691236436560D+00 + pw(15)= 0.51323006313409951821184870857512202D+00 + pw(16)= 0.72448352105440485035350350605592673D+00 + pw(17)= 0.10237957354057707173113819962280747D+01 + pw(18)= 0.14547382360170350806458262214201030D+01 + pw(19)= 0.20988676887781728328899280598296916D+01 + pw(20)= 0.31515233601433750723678379202457850D+01 + pw(21)= 0.53731727475008532237843822389416189D+01 +endif +if(kn == 22) then + px( 1)= 0.66015487793114579672026064530843580D-03 + px( 2)= 0.35283188763257655749934705032913356D-02 + px( 3)= 0.89037268484852140335335192775132156D-02 + px( 4)= 0.17204887313904086178312640698870646D-01 + px( 5)= 0.29150571838447577747778236163957035D-01 + px( 6)= 0.45901615793714000736387715594203021D-01 + px( 7)= 0.69252127065257752538232461849428663D-01 + px( 8)= 0.10186530460793644200891559033979032D+00 + px( 9)= 0.14756646191534939234956638050262194D+00 + px(10)= 0.21173627302990231446602872154763540D+00 + px(11)= 0.30186392834135377102923595739844832D+00 + px(12)= 0.42832456502483048109845734510065518D+00 + px(13)= 0.60545815580026546162531862085612489D+00 + px(14)= 0.85305900599439063193443527117705937D+00 + px(15)= 0.11984444149249300732981072917233551D+01 + px(16)= 0.16793849380767164321266153811733636D+01 + px(17)= 0.23484259421990250059871794075320909D+01 + px(18)= 0.32797346839031417791643463913803933D+01 + px(19)= 0.45812624093642886254220734381797637D+01 + px(20)= 0.64202435545325521457505355440378480D+01 + px(21)= 0.90908861089411744472574137099194608D+01 + px(22)= 0.13281870386631548366752315803495071D+02 + pw( 1)= 0.16997739815079876291270142303147830D-02 + pw( 2)= 0.40716314869569174969368080298995427D-02 + pw( 3)= 0.67473810622389953786984914722964259D-02 + pw( 4)= 0.99723772588960829502246883384624672D-02 + pw( 5)= 0.14109253034463888804804922921072868D-01 + pw( 6)= 0.19687945390172770224722572299963933D-01 + pw( 7)= 0.27452171417575159428955066586690985D-01 + pw( 8)= 0.38405330139190366155267988240486796D-01 + pw( 9)= 0.53884692766803369761362767720533598D-01 + pw(10)= 0.75690346610129349832179626694927492D-01 + pw(11)= 0.10627642236395592176596973365055862D+00 + pw(12)= 0.14901040431853077087285757833822327D+00 + pw(13)= 0.20852197820697925399881980495903384D+00 + pw(14)= 0.29118491216948479495401093995994782D+00 + pw(15)= 0.40581134430662519733362245846705041D+00 + pw(16)= 0.56471875154845478179304492458155087D+00 + pw(17)= 0.78553695828755046999062010778990119D+00 + pw(18)= 0.10946963582465286266450172551190923D+01 + pw(19)= 0.15353099122697920414083249854348604D+01 + pw(20)= 0.21886787280255501097880613119022761D+01 + pw(21)= 0.32509064591655802312953073550183727D+01 + pw(22)= 0.54886295320492726768301302193581765D+01 +endif +if(kn == 23) then + px( 1)= 0.63021006363433841831474149369938629D-03 + px( 2)= 0.33638809698722088589479529444503357D-02 + px( 3)= 0.84677484260852652320615476282127309D-02 + px( 4)= 0.16299243886097943905725295278638059D-01 + px( 5)= 0.27464030423556247673984741054128575D-01 + px( 6)= 0.42927866788160825736932098072471507D-01 + px( 7)= 0.64169139230662564082303838471065529D-01 + px( 8)= 0.93367767657860048911427490092659179D-01 + px( 9)= 0.13363456957733254533882679272072078D+00 + px(10)= 0.18930799757692553449088970134122038D+00 + px(11)= 0.26636229253087365091814709204751323D+00 + px(12)= 0.37297580801326442775998852633706569D+00 + px(13)= 0.52031366792206514283148817023645741D+00 + px(14)= 0.72359600291057382682653618452186879D+00 + px(15)= 0.10035561979187412137041345167147008D+01 + px(16)= 0.13884527722049286720427614547615987D+01 + px(17)= 0.19169141925914051220031027488430615D+01 + px(18)= 0.26421501677657864516298531009004741D+01 + px(19)= 0.36386833380196488331409942198293287D+01 + px(20)= 0.50144408008467284169777642211293427D+01 + px(21)= 0.69363388633219643316723260573935269D+01 + px(22)= 0.96985139442985089392873665294609340D+01 + px(23)= 0.13992267642385099650757505185917464D+02 + pw( 1)= 0.16221873306891233611184498841470861D-02 + pw( 2)= 0.38756062368028647892929037543456805D-02 + pw( 3)= 0.63904681228155448198104664799116312D-02 + pw( 4)= 0.93716130533642383510190970859726025D-02 + pw( 5)= 0.13116359133127231514779092120465823D-01 + pw( 6)= 0.18054564644654175010476006376716577D-01 + pw( 7)= 0.24787545528949442975874855915817576D-01 + pw( 8)= 0.34123347100699979880664417166869481D-01 + pw( 9)= 0.47126644724531756094751076593005203D-01 + pw(10)= 0.65206569654682359265380728357008572D-01 + pw(11)= 0.90252198579742822099518198813744744D-01 + pw(12)= 0.12481812207643272140418893825548652D+00 + pw(13)= 0.17237035227520038668196439463328275D+00 + pw(14)= 0.23761696881618806722148067227743494D+00 + pw(15)= 0.32696691109252817236452507243569941D+00 + pw(16)= 0.44919654054160466696484020754729982D+00 + pw(17)= 0.61648810596906235080169489216474494D+00 + pw(18)= 0.84621818814094735945360350883771170D+00 + pw(19)= 0.11644590806092632658493343351752018D+01 + pw(20)= 0.16139465349437736570093587472680170D+01 + pw(21)= 0.22758365152455412423280162288521126D+01 + pw(22)= 0.33470981208584956190993329351906399D+01 + pw(23)= 0.56005084488693785143169408072395655D+01 +endif +if(kn == 24) then + px( 1)= 0.60287590220675965883655392550708199D-03 + px( 2)= 0.32143351024974124786544026559297415D-02 + px( 3)= 0.80739802149419819753399469823810803D-02 + px( 4)= 0.15489662347070758196056101724102825D-01 + px( 5)= 0.25976667055963881164895151817795199D-01 + px( 6)= 0.40346714011175627204924600258024553D-01 + px( 7)= 0.59830440001259694739569609351980817D-01 + px( 8)= 0.86230308264184025703796753562250373D-01 + px( 9)= 0.12210371973165121163051429982451776D+00 + px(10)= 0.17099203191550984584070639962768156D+00 + px(11)= 0.23772719534691312495205955287685848D+00 + px(12)= 0.32885366959546582684337055007155510D+00 + px(13)= 0.45320556860130100192674333119232671D+00 + px(14)= 0.62268761188094067470211791701931515D+00 + px(15)= 0.85332750333845602641945072572045710D+00 + px(16)= 0.11667004858019451293051164929872329D+01 + px(17)= 0.15918861333740258915421115101398109D+01 + px(18)= 0.21682356132478165458928387387641436D+01 + px(19)= 0.29494889123245918407695084553095216D+01 + px(20)= 0.40104161879498382932676206134824540D+01 + px(21)= 0.54588693247348625992863433295378994D+01 + px(22)= 0.74614789122023638202247166496000371D+01 + px(23)= 0.10312431148373425365575271866282924D+02 + px(24)= 0.14705805275028081231659123827294642D+02 + pw( 1)= 0.15514249847644583248110080072115634D-02 + pw( 2)= 0.36981205611282502501592938773909080D-02 + pw( 3)= 0.60714888966684178001398442739385021D-02 + pw( 4)= 0.88442862964874279166007983229029862D-02 + pw( 5)= 0.12262956109957475661400953934190561D-01 + pw( 6)= 0.16679687959122678642462992762851947D-01 + pw( 7)= 0.22585160392521754760445705422367202D-01 + pw( 8)= 0.30637073979795749450832614038637313D-01 + pw( 9)= 0.41694955770756579068288981525602823D-01 + pw(10)= 0.56880053645721037701497821632682136D-01 + pw(11)= 0.77671508906845583575064408093070968D-01 + pw(12)= 0.10604093207274572585305524523636662D+00 + pw(13)= 0.14462967696642788409146183249874326D+00 + pw(14)= 0.19698231180562617603349065754159954D+00 + pw(15)= 0.26786134570104304270817950330971854D+00 + pw(16)= 0.36368625547483729027913381140943346D+00 + pw(17)= 0.49317740444788268405884668163369343D+00 + pw(18)= 0.66837334895456593354822252636195142D+00 + pw(19)= 0.90640972513509685558875644594827397D+00 + pw(20)= 0.12330418540439459719440490731353612D+01 + pw(21)= 0.16907002256383886184087957665579435D+01 + pw(22)= 0.23604873140910405927871490349730829D+01 + pw(23)= 0.34403155641323731993571442081042100D+01 + pw(24)= 0.57090593461669661084141390492124683D+01 +endif +if(kn == 25) then + px( 1)= 0.57782426626576571901902524785967824D-03 + px( 2)= 0.30777191085612843075800020147563616D-02 + px( 3)= 0.77164086160794063370485387996841129D-02 + px( 4)= 0.14761056704626219657716466804157254D-01 + px( 5)= 0.24653875400014389680962462419049193D-01 + px( 6)= 0.38083523913823039614894844760656273D-01 + px( 7)= 0.56084067240729530165951958613013393D-01 + px( 8)= 0.80159727346580499135438064461924847D-01 + px( 9)= 0.11243339120217788148662085401543468D+00 + px(10)= 0.15582715928119113928223955420169306D+00 + px(11)= 0.21429698577498381337290048538797344D+00 + px(12)= 0.29315054947238081211765761052065623D+00 + px(13)= 0.39947904356246012931942329550682875D+00 + px(14)= 0.54273750890918728697108592749256418D+00 + px(15)= 0.73551917539468211697041027264605642D+00 + px(16)= 0.99458876650351865510403034983347981D+00 + px(17)= 0.13422725867611823095435983972364716D+01 + px(18)= 0.18083631354696402035562158086478313D+01 + px(19)= 0.24328170422327489041615980739248420D+01 + px(20)= 0.32697931199298648820663632485790012D+01 + px(21)= 0.43942247576362161609644700980578340D+01 + px(22)= 0.59138559751958628054368604601307713D+01 + px(23)= 0.79950700685028878658475844781145359D+01 + px(24)= 0.10932212901976999668719864628273880D+02 + px(25)= 0.15422270538187376197239585881075748D+02 + pw( 1)= 0.14866196466384881027031457863846341D-02 + pw( 2)= 0.35366057944120495831689126436344645D-02 + pw( 3)= 0.57844969402891304739165399375745324D-02 + pw( 4)= 0.83773253924994338826352544649181741D-02 + pw( 5)= 0.11521432119169876731207250699755894D-01 + pw( 6)= 0.15508119705558278186014992003540867D-01 + pw( 7)= 0.20741213416907048628693122469214068D-01 + pw( 8)= 0.27760968610727861118254402368893756D-01 + pw( 9)= 0.37269537287121025856202203197461521D-01 + pw(10)= 0.50172098341486540969354039874345299D-01 + pw(11)= 0.67645050740466106776500656782887390D-01 + pw(12)= 0.91234375023821592290664320838692913D-01 + pw(13)= 0.12298569754451901426257783074280505D+00 + pw(14)= 0.16561310646694880188580709432419945D+00 + pw(15)= 0.22272153335996509389159575477724314D+00 + pw(16)= 0.29910756989135904284632341805878551D+00 + pw(17)= 0.40118169825940023204127855629976398D+00 + pw(18)= 0.53759403804464314015326297165302822D+00 + pw(19)= 0.72023661788993173698912869326221741D+00 + pw(20)= 0.96601967120274407325842915516032939D+00 + pw(21)= 0.13004209336797472935438387369012951D+01 + pw(22)= 0.17656287674999046987390122872170472D+01 + pw(23)= 0.24427681629137658222037012630588612D+01 + pw(24)= 0.35307546103133160002458888441211917D+01 + pw(25)= 0.58145056190411423438373377369723833D+01 +endif +if(kn == 26) then + px( 1)= 0.55477997323631075943068059436768679D-03 + px( 2)= 0.29524051251057578595813790584093855D-02 + px( 3)= 0.73901342939831943165981049698675567D-02 + px( 4)= 0.14101412872584234971190262785177463D-01 + px( 5)= 0.23468728331528566192654626405819210D-01 + px( 6)= 0.36081354457771003437927577005080307D-01 + px( 7)= 0.52815777662090440120493976780463113D-01 + px( 8)= 0.74938112557816762151465822385814066D-01 + px( 9)= 0.10422602290395549848995179022541619D+00 + px(10)= 0.14311379843177524483131355370021475D+00 + px(11)= 0.19487526595566569453971371848061316D+00 + px(12)= 0.26386679946717607062454498658707156D+00 + px(13)= 0.35585463477981546153201478673425350D+00 + px(14)= 0.47845228003668789580973663677048048D+00 + px(15)= 0.64169971258466993649179145930192297D+00 + px(16)= 0.85882779202859317392945336605019422D+00 + px(17)= 0.11472706757497270472184438297710363D+01 + px(18)= 0.15300218181930435465993945481075142D+01 + px(19)= 0.20374900555366768284567833572011801D+01 + px(20)= 0.27101363667403762213767958798451042D+01 + px(21)= 0.36024461925895008266595796162422643D+01 + px(22)= 0.47894509506391986224214442346207227D+01 + px(23)= 0.63787680281602106661197309343576824D+01 + px(24)= 0.85365750410497181159653468391328011D+01 + px(25)= 0.11557476465733555694328533876083768D+02 + px(26)= 0.16141471440934226918800792463390524D+02 + pw( 1)= 0.14270457888511795867992646035078330D-02 + pw( 2)= 0.33889549852374609060800978480342579D-02 + pw( 3)= 0.55247507422590039192859514393700253D-02 + pw( 4)= 0.79606058991690688534072920870913751D-02 + pw( 5)= 0.10870893529433384055135393826453360D-01 + pw( 6)= 0.14498723203402237674680094273702632D-01 + pw( 7)= 0.19179116372035461554716799339102955D-01 + pw( 8)= 0.25359449796805531683017831774564858D-01 + pw( 9)= 0.33619093306389351199382063563600456D-01 + pw(10)= 0.44698102947503307879411438578940957D-01 + pw(11)= 0.59545498569788992362177569759542076D-01 + pw(12)= 0.79391390756669405807121619168425508D-01 + pw(13)= 0.10584364527942384492476511067473085D+00 + pw(14)= 0.14101237929374191599558996117239049D+00 + pw(15)= 0.18767097313836850144218059320809785D+00 + pw(16)= 0.24946862930962964921754344345139933D+00 + pw(17)= 0.33121906265157831801486301115689293D+00 + pw(18)= 0.43930860961154203611926221969681638D+00 + pw(19)= 0.58230770087411222173176311911927048D+00 + pw(20)= 0.77196313761963428481446391672957021D+00 + pw(21)= 0.10249770981257475300463739775919616D+01 + pw(22)= 0.13665866627564720110102420582560601D+01 + pw(23)= 0.18387931590238615303343058387625137D+01 + pw(24)= 0.25228070818980592228138174960180006D+01 + pw(25)= 0.36185923153701902774641344711352413D+01 + pw(26)= 0.59170478659334870019465678075549678D+01 +endif +if(kn == 27) then + px( 1)= 0.53351049035404632679792315747760926D-03 + px( 2)= 0.28370305792687798355464055618059207D-02 + px( 3)= 0.70911216690725917919709661777310467D-02 + px( 4)= 0.13501035871356777604646936827660401D-01 + px( 5)= 0.22399961695605187088994736400122990D-01 + px( 6)= 0.34296092152256244946377676950669096D-01 + px( 7)= 0.49938369643043464349412733250120371D-01 + px( 8)= 0.70401007218931052290887962696404315D-01 + px( 9)= 0.97184515439527403402719960641852663D-01 + px(10)= 0.13233419679650342626453438780924062D+00 + px(11)= 0.17858535768221430523992541052027664D+00 + px(12)= 0.23955186967002279067082055689473205D+00 + px(13)= 0.31997738343786115574032202180084953D+00 + px(14)= 0.42606917808870858677237819912563285D+00 + px(15)= 0.56593757336952698275709554352322422D+00 + px(16)= 0.75017087050170008688238981674255895D+00 + px(17)= 0.99258767627203407312800418206037430D+00 + px(18)= 0.13112276119862221456283706350994460D+01 + px(19)= 0.17296743945926671138206419268165977D+01 + px(20)= 0.22788674435500397192690477951071238D+01 + px(21)= 0.29996852862458953100640794763905730D+01 + px(22)= 0.39468640965897806748062604397538661D+01 + px(23)= 0.51954835049739316659023548298272390D+01 + px(24)= 0.68530258860308177417175245026171396D+01 + px(25)= 0.90855060116903171155308252608094055D+01 + px(26)= 0.12187875689542501045614051917426613D+02 + px(27)= 0.16863234023420842677139640506273211D+02 + pw( 1)= 0.13720916834443970883055143999864006D-02 + pw( 2)= 0.32534222826312859680330393960878445D-02 + pw( 3)= 0.52884203958454954233826913886323010D-02 + pw( 4)= 0.75861682815905379946822876423968075D-02 + pw( 5)= 0.10295285639112777079908697606376495D-01 + pw( 6)= 0.13620382459545697869019461721069283D-01 + pw( 7)= 0.17841523884178755603525268045244929D-01 + pw( 8)= 0.23331884086195134219312146516778989D-01 + pw( 9)= 0.30573638051336305998992920739412241D-01 + pw(10)= 0.40178513173563771124515253029295068D-01 + pw(11)= 0.52921980029762362141777841892525060D-01 + pw(12)= 0.69796141123370781915050562748149785D-01 + pw(13)= 0.92082178946617789793924526199756695D-01 + pw(14)= 0.12144364501581543284664137255263233D+00 + pw(15)= 0.16004546398125829773973180676428128D+00 + pw(16)= 0.21070793305215927164839501305159383D+00 + pw(17)= 0.27711057981819535787363684480367458D+00 + pw(18)= 0.36407033983503565406434335329539225D+00 + pw(19)= 0.47793795110360227457939282572737921D+00 + pw(20)= 0.62719851165950099532441318332470642D+00 + pw(21)= 0.82345785449976155240467438227528273D+00 + pw(22)= 0.10832282284259041619866566987699138D+01 + pw(23)= 0.14315401678117573067846471990768551D+01 + pw(24)= 0.19102558501900861570470238950653852D+01 + pw(25)= 0.26007234476536547511161089599236255D+01 + pw(26)= 0.37039892290540711004993184577181103D+01 + pw(27)= 0.60168669305235587630364842700538762D+01 +endif +if(kn == 28) then + px( 1)= 0.51381795837439443363513212680713947D-03 + px( 2)= 0.27304457410704199858650659828620367D-02 + px( 3)= 0.68160137769119765426643490207146340D-02 + px( 4)= 0.12952008943105054557760398423212956D-01 + px( 5)= 0.21430556623995591107026232001066913D-01 + px( 6)= 0.32693077435416129189765100617149847D-01 + px( 7)= 0.47384339361452651219408943088094832D-01 + px( 8)= 0.66422538658869906464895332297645726D-01 + px( 9)= 0.91083609265701992171879031038032271D-01 + px(10)= 0.12309931515680468843361103443044334D+00 + px(11)= 0.16477454361350713713658223760718724D+00 + px(12)= 0.21913585560877712250760133646822826D+00 + px(13)= 0.29012664768975497883815066595179815D+00 + px(14)= 0.38286487674840579443374001038670456D+00 + px(15)= 0.50398058179729621330814443966676445D+00 + px(16)= 0.66205451410726746024107068348273274D+00 + px(17)= 0.86818664253955900521872813145997494D+00 + px(18)= 0.11367350670636899400604869200366967D+01 + px(19)= 0.14862849200104679887431662518488448D+01 + px(20)= 0.19409402276831122951996033127610769D+01 + px(21)= 0.25320952507423009788362491203678215D+01 + px(22)= 0.33009718613493033820591483341014688D+01 + px(23)= 0.43024948141012255149771188379035923D+01 + px(24)= 0.56117545045133971321428621533897157D+01 + px(25)= 0.73360976078470116280524716610880778D+01 + px(26)= 0.96414187592042593009284671180300365D+01 + px(27)= 0.12823096392169689557014539066503465D+02 + px(28)= 0.17587400071042305461725873529323834D+02 + pw( 1)= 0.13212378315760757100801294124815478D-02 + pw( 2)= 0.31285477801160114283501913408130908D-02 + pw( 3)= 0.50723762615025144279692888527822532D-02 + pw( 4)= 0.72476698125814896247817149986171532D-02 + pw( 5)= 0.97820955598317266921336611328577874D-02 + pw( 6)= 0.12849233584715652412922486858932270D-01 + pw( 7)= 0.16684914458309663575212478712400069D-01 + pw( 8)= 0.21602517467719514930691067702478109D-01 + pw( 9)= 0.28006326728639319667000527469785398D-01 + pw(10)= 0.36406794940307839233444412699072219D-01 + pw(11)= 0.47444873633013210207704765440613480D-01 + pw(12)= 0.61930727493054842346818886062879310D-01 + pw(13)= 0.80898273560459582900261577117872215D-01 + pw(14)= 0.10567594748292560400210313567244803D+00 + pw(15)= 0.13797620661979101239766714101470269D+00 + pw(16)= 0.18000947429659496345933986602370689D+00 + pw(17)= 0.23463186153608726981379368239281040D+00 + pw(18)= 0.30554132794139847744947603213040709D+00 + pw(19)= 0.39754682244155206328794798554812257D+00 + pw(20)= 0.51695511158335980560554454196568031D+00 + pw(21)= 0.67216328370835447637864341433166332D+00 + pw(22)= 0.87464250694849374310652866430377750D+00 + pw(23)= 0.11407332853853718344895028360165344D+01 + pw(24)= 0.14952907674277354153396565164371127D+01 + pw(25)= 0.19800794802566744638917545875047599D+01 + pw(26)= 0.26766284602650483548086237399782999D+01 + pw(27)= 0.37870913379223214170502998382833988D+01 + pw(28)= 0.61141264493435815963177300026057897D+01 +endif +if(kn == 29) then + px( 1)= 0.49553294523216557706121043094899937D-03 + px( 2)= 0.26316733478032430241054271443133571D-02 + px( 3)= 0.65619932921719095950945647167842097D-02 + px( 4)= 0.12447798146998465679635850497317547D-01 + px( 5)= 0.20546725218853503579585565380355127D-01 + px( 6)= 0.31244723477287375693064121565353181D-01 + px( 7)= 0.45100744250336127000529420647899915D-01 + px( 8)= 0.62905104337181109718071876082818174D-01 + px( 9)= 0.85750236217882191338049895922209411D-01 + px(10)= 0.11511276616778913843095284495474961D+00 + px(11)= 0.15294975780371902992191410154541288D+00 + px(12)= 0.20181754709239365590998602589064153D+00 + px(13)= 0.26502522465295890912360438565915637D+00 + px(14)= 0.34683572458174845871862351357320832D+00 + px(15)= 0.45272795494425557401215215391241853D+00 + px(16)= 0.58973560996706034541847942850272405D+00 + px(17)= 0.76688297098562182278932098742313991D+00 + px(18)= 0.99574549679776398032257261690761656D+00 + px(19)= 0.12911746195797983348858102746352939D+01 + px(20)= 0.16722452518287396697078251913438203D+01 + px(21)= 0.21635184617418031866723236843209620D+01 + px(22)= 0.27967767483991669703409180904294707D+01 + px(23)= 0.36135221397956750546277975652240522D+01 + px(24)= 0.46688173594398401975825823496477303D+01 + px(25)= 0.60377360285854709978780788792051344D+01 + px(26)= 0.78274940573494599108741305430804683D+01 + px(27)= 0.10203907613754336363839677845176062D+02 + px(28)= 0.13462852450631673420328416214301979D+02 + px(29)= 0.18313825184349771602172383716912527D+02 + pw( 1)= 0.12740401404642070584625642978316674D-02 + pw( 2)= 0.30131005567070877653374960845857871D-02 + pw( 3)= 0.48740340828768558996439695359657849D-02 + pw( 4)= 0.69399937174499322748248258405693356D-02 + pw( 5)= 0.93214421069590292260830707831625273D-02 + pw( 6)= 0.12166736320428934613437189621950636D-01 + pw( 7)= 0.15675841191975760517355581582896129D-01 + pw( 8)= 0.20113588991116143831392394792507242D-01 + pw( 9)= 0.25821196242227502811986031677074846D-01 + pw(10)= 0.33228090412405890048263783552877789D-01 + pw(11)= 0.42869421776728397919368187941044920D-01 + pw(12)= 0.55414351585478714766063918835679369D-01 + pw(13)= 0.71707159949828379722440698604318357D-01 + pw(14)= 0.92821391683596645191185627202531776D-01 + pw(15)= 0.12012815275736363503642593112420207D+00 + pw(16)= 0.15538195214533728663259981412132171D+00 + pw(17)= 0.20083006704520997993532092391882379D+00 + pw(18)= 0.25935464058941676571399243604292685D+00 + pw(19)= 0.33466206352796625147483719389400201D+00 + pw(20)= 0.43154442816926686793937626854331538D+00 + pw(21)= 0.55625869020489676787739935371459343D+00 + pw(22)= 0.71711352998990444136642620781136560D+00 + pw(23)= 0.92545309501307835297306650109719589D+00 + pw(24)= 0.11974638996261713170731294345952752D+01 + pw(25)= 0.15578539402639460936307650070163272D+01 + pw(26)= 0.20483259849497670771611527406588667D+01 + pw(27)= 0.27506256539950821112799212513803514D+01 + pw(28)= 0.38680317406459942528573299847997115D+01 + pw(29)= 0.62089749987608497692655462325987093D+01 +endif +if(kn == 30) then + px( 1)= 0.47850950616663001797979108558563715D-03 + px( 2)= 0.25398771556977649765987433184628366D-02 + px( 3)= 0.63266767276005636636918971079049107D-02 + px( 4)= 0.11982958698830863481679603697851263D-01 + px( 5)= 0.19737172990592845187334346934200923D-01 + px( 6)= 0.29928809539955562795388150514477224D-01 + px( 7)= 0.43045554614365273736449489448924963D-01 + px( 8)= 0.59772102008795894644542468648119854D-01 + px( 9)= 0.81049794509547598102960610576252333D-01 + px(10)= 0.10814588049834038786751975710018494D+00 + px(11)= 0.14273357759635461536908362003919156D+00 + px(12)= 0.18698857852398412902900671067259807D+00 + px(13)= 0.24371127323293364985980734512310093D+00 + px(14)= 0.31648527240822183351218895583590417D+00 + px(15)= 0.40988303161683259239808922937843934D+00 + px(16)= 0.52973043830387724896707606257507982D+00 + px(17)= 0.68344507067026327204085106743008657D+00 + px(18)= 0.88046769972296521320190133680773247D+00 + px(19)= 0.11328140045719997686900182451437385D+01 + px(20)= 0.14557849987988940136735253178131959D+01 + px(21)= 0.18688939211845819607060281336643708D+01 + px(22)= 0.23971019483057447245002052718973892D+01 + px(23)= 0.30725214791534315669485745292475362D+01 + px(24)= 0.39368810885201871521056281051541589D+01 + px(25)= 0.50453405183264296277555982679398449D+01 + px(26)= 0.64729369863961542161639397127927023D+01 + px(27)= 0.83267646024511909496921957165518850D+01 + px(28)= 0.10772601108679545696397581453580679D+02 + px(29)= 0.14106882472397927743465848536293191D+02 + px(30)= 0.19042377139722396576925840568797290D+02 + pw( 1)= 0.12301166672602858132749403305585138D-02 + pw( 2)= 0.29060349388917145213464128010319648D-02 + pw( 3)= 0.46912396989831005493100286944698875D-02 + pw( 4)= 0.66589658633863613568668981891460303D-02 + pw( 5)= 0.89054272701749525193189873717149483D-02 + pw( 6)= 0.11558310934603853176421212815818196D-01 + pw( 7)= 0.14788296604172157875123267577237353D-01 + pw( 8)= 0.18820536178193913710320280585533146D-01 + pw( 9)= 0.23944736371120032144351948458331453D-01 + pw(10)= 0.30524720054213913983988370110313231D-01 + pw(11)= 0.39011278230203159299747340391501401D-01 + pw(12)= 0.49962838489107032100208706917355628D-01 + pw(13)= 0.64076419786867156599743093793566683D-01 + pw(14)= 0.82229278957901320956487642041730955D-01 + pw(15)= 0.10553162871986467114052832148689872D+00 + pw(16)= 0.13539231869402525697097331669289294D+00 + pw(17)= 0.17360130147051396517985693386136263D+00 + pw(18)= 0.22243486135302704942601690670544171D+00 + pw(19)= 0.28479267765730766074850970825016903D+00 + pw(20)= 0.36438127580923131688892141299329716D+00 + pw(21)= 0.46596902265107476258412993277856600D+00 + pw(22)= 0.59575928418635779724375261556025579D+00 + pw(23)= 0.76197365269330802425420319974692784D+00 + pw(24)= 0.97583770654840390135150485526335360D+00 + pw(25)= 0.12534009762825410047987823857212526D+01 + pw(26)= 0.16192497318165861534514297610388291D+01 + pw(27)= 0.21150559758743170143427430064927639D+01 + pw(28)= 0.28228114202692982120171862876336191D+01 + pw(29)= 0.39469320964297990895048746179623006D+01 + pw(30)= 0.63015479148815916165232775496615043D+01 +endif +if(kn == 31) then + px( 1)= 0.46262124155980990900658158435387435D-03 + px( 2)= 0.24543371823976573134043990222488764D-02 + px( 3)= 0.61080328529962740945375030974273550D-02 + px( 4)= 0.11552913696384311289392080456928671D-01 + px( 5)= 0.18992554406021427042667932899509712D-01 + px( 6)= 0.28727239767175649142836330874634688D-01 + px( 7)= 0.41185025043396586435794925588583817D-01 + px( 8)= 0.56962729337650718045473296117050346D-01 + px( 9)= 0.76876408592321896648696565625767740D-01 + px(10)= 0.10202017613750426995368814592543654D+00 + px(11)= 0.13383358226439616664891093380483172D+00 + px(12)= 0.17418111785696519564699242954723688D+00 + px(13)= 0.22545079288954335283945305080525555D+00 + px(14)= 0.29068039334037578871072959727864135D+00 + px(15)= 0.37372028651206644042103206921271373D+00 + px(16)= 0.47944207171169672906363477630137984D+00 + px(17)= 0.61400401892256187595697112035275391D+00 + px(18)= 0.78518739405456531463788153132800130D+00 + px(19)= 0.10028226298243299869022545200635749D+01 + px(20)= 0.12793315955946422764804637028085576D+01 + px(21)= 0.16304237404723823714635864157191701D+01 + px(22)= 0.20760033968995280036753528016386266D+01 + px(23)= 0.26413808369702250796358745676110842D+01 + px(24)= 0.33589474379157213932868619266312477D+01 + px(25)= 0.42706130083047912404753160217494471D+01 + px(26)= 0.54316014237021642862351869609909389D+01 + px(27)= 0.69169001596879784406900118967237735D+01 + px(28)= 0.88334933035422052410960663437339313D+01 + px(29)= 0.11347158219793510037091924273351551D+02 + px(30)= 0.14754946949122040546764367410101515D+02 + px(31)= 0.19772934489331283651619534072783573D+02 + pw( 1)= 0.11891370745192323541501363233610476D-02 + pw( 2)= 0.28064565117632969596616229162402416D-02 + pw( 3)= 0.45221820039177658202628966334564400D-02 + pw( 4)= 0.64011462699419650278998414745094563D-02 + pw( 5)= 0.85276672843451997393681235521893987D-02 + pw( 6)= 0.11012361472143567538937306026897082D-01 + pw( 7)= 0.14001834101185433452944676115710984D-01 + pw( 8)= 0.17688602970920088694526112750101821D-01 + pw( 9)= 0.22319993264776239711478935025076064D-01 + pw(10)= 0.28206159264133299993412221361362783D-01 + pw(11)= 0.35729779624362463483758438125027756D-01 + pw(12)= 0.45361209930192015109636119041934318D-01 + pw(13)= 0.57681745959882977997939097436077625D-01 + pw(14)= 0.73415730766069123013128503531856238D-01 + pw(15)= 0.93471597750154348901340915093546613D-01 + pw(16)= 0.11899278980540690788248491553604027D+00 + pw(17)= 0.15142094037875549556146133150023155D+00 + pw(18)= 0.19257526453331204560446543583285304D+00 + pw(19)= 0.24475404687248924789069417791551804D+00 + pw(20)= 0.31086721331790994056051025981812504D+00 + pw(21)= 0.39461463811187169999440896691515521D+00 + pw(22)= 0.50073578853625257469356342726971100D+00 + pw(23)= 0.63537831862561035023942323431330555D+00 + pw(24)= 0.80667931860754124288487929852367162D+00 + pw(25)= 0.10257546571965496140352844249415068D+01 + pw(26)= 0.13085329420794059702124934248171024D+01 + pw(27)= 0.16795015057251400866456256943483557D+01 + pw(28)= 0.21803283205051475095028864283497279D+01 + pw(29)= 0.28932755231818831561701787327064654D+01 + pw(30)= 0.40239038808605675586573707403263756D+01 + pw(31)= 0.63919688446180182394784276398611383D+01 +endif +if(kn == 32) then + px( 1)= 0.44775812457284018024389459757135253D-03 + px( 2)= 0.23744300256081390660731331947131448D-02 + px( 3)= 0.59043190530701947087959015074527131D-02 + px( 4)= 0.11153785163059738085602585751236982D-01 + px( 5)= 0.18305065328015338438613004118489460D-01 + px( 6)= 0.27625127970368047372371104083117018D-01 + px( 7)= 0.39491775186977265606198670142306198D-01 + px( 8)= 0.54428213199640073895058009136940445D-01 + px( 9)= 0.73145916598392207837439901430911881D-01 + px(10)= 0.96594842317146387626737019235932073D-01 + px(11)= 0.12602067507135613816122919853447211D+00 + px(12)= 0.16303119243744080803051027229475087D+00 + px(13)= 0.20967680096838206807696535069330795D+00 + px(14)= 0.26855214787001775388966483767701730D+00 + px(15)= 0.34292617657317298468348748407414908D+00 + px(16)= 0.43690813207789007326976756359678361D+00 + px(17)= 0.55565789063347173772943038765658058D+00 + px(18)= 0.70565099753808518503172131364185913D+00 + px(19)= 0.89501204411614229330337167600565325D+00 + px(20)= 0.11339348063954267650829490706793417D+01 + px(21)= 0.14352147947875517113501672147412666D+01 + px(22)= 0.18149314500489324404656062560197686D+01 + px(23)= 0.22933370525154514327968223407435420D+01 + px(24)= 0.28960454333191297694038803298227801D+01 + px(25)= 0.36556826421582258456995650363192002D+01 + px(26)= 0.46143015669042314317688786912840126D+01 + px(27)= 0.58271640488452237241101113536108246D+01 + px(28)= 0.73691994618471152327249495515078891D+01 + px(29)= 0.93472955333283431319402613994267131D+01 + px(30)= 0.11927265101386731589772177284607045D+02 + px(31)= 0.15406825810545224143258580731663182D+02 + px(32)= 0.20505385359315843909738804845728816D+02 + pw( 1)= 0.11508141693310289431516002902194819D-02 + pw( 2)= 0.27135954170378174829279299706770743D-02 + pw( 3)= 0.43653263744209382080072687149869796D-02 + pw( 4)= 0.61636735125379730211346708661816263D-02 + pw( 5)= 0.81829488620066496094938528145972129D-02 + pw( 6)= 0.10519566782024470065554644097490174D-01 + pw( 7)= 0.13300211432891686360240172032295580D-01 + pw( 8)= 0.16690405167066082789755182863245534D-01 + pw( 9)= 0.20902378399561394483383583702025306D-01 + pw(10)= 0.26201993681677797113818409392334214D-01 + pw(11)= 0.32916309597556149563466321558626500D-01 + pw(12)= 0.41444783686249708806280965671571355D-01 + pw(13)= 0.52276724501596285458733044616038927D-01 + pw(14)= 0.66016045915514602434595691235181054D-01 + pw(15)= 0.83413404413409668999356206612085646D-01 + pw(16)= 0.10540610055778962043592758135111163D+00 + pw(17)= 0.13316716078519390466674733202657980D+00 + pw(18)= 0.16816620873421840783275153829642565D+00 + pw(19)= 0.21224605690896243222098764241728942D+00 + pw(20)= 0.26772081279710238967904075331055395D+00 + pw(21)= 0.33750446543867324459731453510204593D+00 + pw(22)= 0.42528477444911122706373139673310844D+00 + pw(23)= 0.53576855465657922120503664249148172D+00 + pw(24)= 0.67504694124786279657121038348068956D+00 + pw(25)= 0.85117601343595945001255999545786031D+00 + pw(26)= 0.10751709034800588277387396074134080D+01 + pw(27)= 0.13628543051087082785935353862196062D+01 + pw(28)= 0.17386349660199069072599128814460612D+01 + pw(29)= 0.22441998697757930470131299950344772D+01 + pw(30)= 0.29621015955004271568195551910771395D+01 + pw(31)= 0.40990494779803030065007758020292907D+01 + pw(32)= 0.64803510743879183341918928981358705D+01 +endif +if(kn == 33) then + px( 1)= 0.43382392840086627333673371770410791D-03 + px( 2)= 0.22996130762133902306219348170634844D-02 + px( 3)= 0.57140311482877757226936621231899489D-02 + px( 4)= 0.10782263460575904849778834680881905D-01 + px( 5)= 0.17668133941325186930637700801463696D-01 + px( 6)= 0.26610114005745400437311454869685139D-01 + px( 7)= 0.37943370827578419250767127256762961D-01 + px( 8)= 0.52129041240975080218154665697626616D-01 + px( 9)= 0.69790754616119288011923380403081974D-01 + px(10)= 0.91757677110053287920628073294896820D-01 + px(11)= 0.11911351898045213836878312985804824D+00 + px(12)= 0.15325258383592569039382713986292282D+00 + px(13)= 0.19594640045311118270956256575773479D+00 + px(14)= 0.24942637714448661140410427601352782D+00 + px(15)= 0.31648861015431273270737411206235176D+00 + px(16)= 0.40062704984338027141229486836727772D+00 + px(17)= 0.50620162303052368324000671362943743D+00 + px(18)= 0.63864913663861731335041615079256755D+00 + px(19)= 0.80474696130540033570426941368276095D+00 + px(20)= 0.10129427268888740212312948144945017D+01 + px(21)= 0.12737679814899102853785194013828910D+01 + px(22)= 0.16003609719649203492661497770183358D+01 + px(23)= 0.20091354103290292174609108669156369D+01 + px(24)= 0.25206522780436439397099624663858851D+01 + px(25)= 0.31607884518783084627961967518655955D+01 + px(26)= 0.39623662204110196024719874486085284D+01 + px(27)= 0.49675495548589556752790709024651368D+01 + px(28)= 0.62316176745314993859698590995521575D+01 + px(29)= 0.78294374119411180631665027515007813D+01 + px(30)= 0.98678149766403120157427026982285647D+01 + px(31)= 0.12512632243648644312040507648114218D+02 + px(32)= 0.16062316312799007384705365546800182D+02 + px(33)= 0.21239626413140571003413447643564263D+02 + pw( 1)= 0.11148970595521907012019732812607055D-02 + pw( 2)= 0.26267851653678976569258782421234610D-02 + pw( 3)= 0.42193631422457547168621098688321193D-02 + pw( 4)= 0.59441470797152969047545135707624789D-02 + pw( 5)= 0.78669738899974671911070000143132306D-02 + pw( 6)= 0.10072359597590157564525137294462730D-01 + pw( 7)= 0.12670399361862170180506056379944600D-01 + pw( 8)= 0.15804160264438711138334353671976234D-01 + pw( 9)= 0.19656646222033544724001127769742948D-01 + pw(10)= 0.24456891716604317647752096255377977D-01 + pw(11)= 0.30486078976025291042502816433302855D-01 + pw(12)= 0.38085942822052281180984432235900915D-01 + pw(13)= 0.47671859587250413816889365465789990D-01 + pw(14)= 0.59751905747580346721160295363733596D-01 + pw(15)= 0.74952101551419066714546625345670463D-01 + pw(16)= 0.94047940645329027189027157260871966D-01 + pw(17)= 0.11800297644299036213439491803576584D+00 + pw(18)= 0.14801615944289654934244517014490498D+00 + pw(19)= 0.18558059212548173745682851099099352D+00 + pw(20)= 0.23255756871893273043599614945440298D+00 + pw(21)= 0.29127163202367449192902938057774536D+00 + pw(22)= 0.36463564879320343679994202632934432D+00 + pw(23)= 0.45632094652154984485718193074718634D+00 + pw(24)= 0.57099911532700834474505095568484057D+00 + pw(25)= 0.71470499479041253227766745277939393D+00 + pw(26)= 0.89541776346343118676696712049315123D+00 + pw(27)= 0.11240606917291058823868093321319567D+01 + pw(28)= 0.14163644716761897658439794410119483D+01 + pw(29)= 0.17966773927205272792519385809395869D+01 + pw(30)= 0.23067252939728463992747791116176630D+01 + pw(31)= 0.30293676082841986853392483236331039D+01 + pw(32)= 0.41724631327442166841549815997399232D+01 + pw(33)= 0.65667986737889171343050053820031099D+01 +endif +if(kn == 34) then + px( 1)= 0.42073412472404638893756652613349609D-03 + px( 2)= 0.22294117501223585040813842121338791D-02 + px( 3)= 0.55358634424631592178894736933123541D-02 + px( 4)= 0.10435505221909818911938312253494801D-01 + px( 5)= 0.17076183522530106340715524871890215D-01 + px( 6)= 0.25671846892716699934599581849440412D-01 + px( 7)= 0.36521262393930458152775347026295878D-01 + px( 8)= 0.50032905506982571537052624490236219D-01 + px( 9)= 0.66756179329851771946428938115562858D-01 + px(10)= 0.87418440517600995276435533200107158D-01 + px(11)= 0.11296720871152482793171861979004167D+00 + px(12)= 0.14461798209060507185491668160548863D+00 + px(13)= 0.18391003445626398421131159916752177D+00 + px(14)= 0.23277438392452600064632190536358711D+00 + px(15)= 0.29361900416298607661742971283165405D+00 + px(16)= 0.36943648509469646191303516064166821D+00 + px(17)= 0.46393948817397095233405427748578045D+00 + px(18)= 0.58173003909628716647282591793935082D+00 + px(19)= 0.72851013492593350843955115700236997D+00 + px(20)= 0.91134335818633179015533214138153748D+00 + px(21)= 0.11389803746065945729570613409749246D+01 + px(22)= 0.14222658613929989350760708533653619D+01 + px(23)= 0.17746516415212834951524711400325322D+01 + px(24)= 0.22128526659161258391280177907354742D+01 + px(25)= 0.27577030475922356628315880749135163D+01 + px(26)= 0.34353067716163467499117583277012187D+01 + px(27)= 0.42786491225875840394307069342936437D+01 + px(28)= 0.53299784436903981664272138662385839D+01 + px(29)= 0.66445753694632339412682622877981687D+01 + px(30)= 0.82972428161435530717136911066799313D+01 + px(31)= 0.10394720964339982803669830220514570D+02 + px(32)= 0.13102991988894450504911092547924425D+02 + px(33)= 0.16721231207604090231249152699717947D+02 + px(34)= 0.21975561953387058269749184334695681D+02 + pw( 1)= 0.10811655766145357018010507192110778D-02 + pw( 2)= 0.25454456692504695230374260776656758D-02 + pw( 3)= 0.40831672721796992810291858236827121D-02 + pw( 4)= 0.57405373464649057612542381878716024D-02 + pw( 5)= 0.75761674957710259616551754950037804D-02 + pw( 6)= 0.96645393330211172110854747921485211D-02 + pw( 7)= 0.12101849132180839391980047456808267D-01 + pw( 8)= 0.15012385166203703798856896165367048D-01 + pw( 9)= 0.18554685664439102794960148200939741D-01 + pw(10)= 0.22926965460919156862511787954586100D-01 + pw(11)= 0.28372234446912034618472293586026471D-01 + pw(12)= 0.35184737546237040728179133891634893D-01 + pw(13)= 0.43719795721391424383284929004984363D-01 + pw(14)= 0.54408450285118084483187831793976080D-01 + pw(15)= 0.67777311678116535143958157494852949D-01 + pw(16)= 0.84473623053475603301350103260219498D-01 + pw(17)= 0.10529589998535694380806537413267952D+00 + pw(18)= 0.13123120457466970233581600829200986D+00 + pw(19)= 0.16350086670529523991639926152378893D+00 + pw(20)= 0.20361729924366521691008806803033027D+00 + pw(21)= 0.25345571161549014721855872141678839D+00 + pw(22)= 0.31534642949457072923890964012293452D+00 + pw(23)= 0.39219690177745437575400107500840528D+00 + pw(24)= 0.48765869057533013816866773942301700D+00 + pw(25)= 0.60636656060951114856441189594293935D+00 + pw(26)= 0.75430007270808930870580918261471173D+00 + pw(27)= 0.93936601064002840188234207354112438D+00 + pw(28)= 0.11724044094922051599035401593357573D+01 + pw(29)= 0.14690667743714044794229623979582298D+01 + pw(30)= 0.18536570456604450828475022635489700D+01 + pw(31)= 0.23679569977291335149484204767547713D+01 + pw(32)= 0.30951463106197815052582135293694448D+01 + pw(33)= 0.42442317841513325464351782732665369D+01 + pw(34)= 0.66514074844570206292427905989816795D+01 +endif +if(kn == 35) then + px( 1)= 0.40841415548604021515920964002399699D-03 + px( 2)= 0.21634090823285078411638264192646176D-02 + px( 3)= 0.53686766256969092483856645653707325D-02 + px( 4)= 0.10111052749147679432588829471203234D-01 + px( 5)= 0.16524448303260385083329478751404387D-01 + px( 6)= 0.24801589512429266354562963867459375D-01 + px( 7)= 0.35209981971294629902224557027954734D-01 + px( 8)= 0.48113158246267914322027983645621137D-01 + px( 9)= 0.63997447579595411241665965285069235D-01 + px(10)= 0.83503921106837065850088749934585509D-01 + px(11)= 0.10746492183189159858451002479100191D+00 + px(12)= 0.13694520147247420040072594436271529D+00 + px(13)= 0.17328916996785784591140856733770825D+00 + px(14)= 0.21817740121438829093802932764502942D+00 + px(15)= 0.27369653832797779961249433698659440D+00 + px(16)= 0.34242699906149186851837117836266744D+00 + px(17)= 0.42755291549796611179923030387067599D+00 + px(18)= 0.53299909554182158428637150014155340D+00 + px(19)= 0.66360070964681593124502281725999578D+00 + px(20)= 0.82531292882837242125696121653082133D+00 + px(21)= 0.10254699397214209538105608220109125D+01 + px(22)= 0.12731058972360937198805843671213415D+01 + px(23)= 0.15793550240911338542780101952325349D+01 + px(24)= 0.19579553515472245197868811537799063D+01 + px(25)= 0.24258926480980508600328748991363509D+01 + px(26)= 0.30042420249956940401285542677229005D+01 + px(27)= 0.37193027838966272865717795755051677D+01 + px(28)= 0.46041945352246751335584778757254638D+01 + px(29)= 0.57012278072022800341169042054436900D+01 + px(30)= 0.70656725104389005421933634969655744D+01 + px(31)= 0.87722686454754899579881428713693370D+01 + px(32)= 0.10927706100825575628425432470397431D+02 + px(33)= 0.13698096354347216687650292593862930D+02 + px(34)= 0.17383397148588794631800252981912492D+02 + px(35)= 0.22713103140185533251728138884147593D+02 + pw( 1)= 0.10494256988526777696106600190246748D-02 + pw( 2)= 0.24690695403322712989700976425347600D-02 + pw( 3)= 0.39557664886781607598248979520245308D-02 + pw( 4)= 0.55511159047852988512979064225320046D-02 + pw( 5)= 0.73075320876513631863948214849392405D-02 + pw( 6)= 0.92909810870320629850049703997767543D-02 + pw( 7)= 0.11585945487945439996692114097667348D-01 + pw( 8)= 0.14300927726198362348901143963929085D-01 + pw( 9)= 0.17573886933485282780152536853558582D-01 + pw(10)= 0.21577101857175674214083208317224449D-01 + pw(11)= 0.26521578525384159056539231969282277D-01 + pw(12)= 0.32662117530178998186687978899517756D-01 + pw(13)= 0.40304760583700356024679394121082348D-01 + pw(14)= 0.49818024406493848323885700890345396D-01 + pw(15)= 0.61648498278387927702353594142316029D-01 + pw(16)= 0.76340842519971003579540175209086024D-01 + pw(17)= 0.94562313841812711417919902804637839D-01 + pw(18)= 0.11713243611448668905011510723033361D+00 + pw(19)= 0.14505903302187588429490257418340878D+00 + pw(20)= 0.17958246056420660249030476513346108D+00 + pw(21)= 0.22223064193925674596660827103307295D+00 + pw(22)= 0.27488866495140173131704069241445894D+00 + pw(23)= 0.33988865913831065785773004058455433D+00 + pw(24)= 0.42012914561889293897263514396102618D+00 + pw(25)= 0.51923942630667304009247143973261919D+00 + pw(26)= 0.64181663147268449329914151912099321D+00 + pw(27)= 0.79378665914719288398623606417929212D+00 + pw(28)= 0.98298862630823929161929765759110274D+00 + pw(29)= 0.12201876100804632086249205242578178D+01 + pw(30)= 0.15209676736777719585360185637783971D+01 + pw(31)= 0.19096027011177832508222236236206329D+01 + pw(32)= 0.24279450923678125311674033890233300D+01 + pw(33)= 0.31595056381881229211813965570928908D+01 + pw(34)= 0.43144357961085612515538420099471387D+01 + pw(35)= 0.67342659787038143488625655899472123D+01 +endif +if(kn == 36) then + px( 1)= 0.39679800273035937692456569764529929D-03 + px( 2)= 0.21012371857514574429814634636370868D-02 + px( 3)= 0.52114717733736355471865173697424917D-02 + px( 4)= 0.98067697534794548776488857189272695D-02 + px( 5)= 0.16008829048610610861085787414831567D-01 + px( 6)= 0.23991913025422802197171963117636148D-01 + px( 7)= 0.33996529414545629919769339896568646D-01 + px( 8)= 0.46347640349961743012995653495638967D-01 + px( 9)= 0.61477688475695067453069318914244194D-01 + px(10)= 0.79954234262530625440149978974196142D-01 + px(11)= 0.10251169364078550312390054066216880D+00 + px(12)= 0.13008697511618887542147079025562156D+00 + px(13)= 0.16385989560749921185926347303565033D+00 + px(14)= 0.20530066324396880599546126878370330D+00 + px(15)= 0.25622775706568656239933185280550197D+00 + px(16)= 0.31887992047473434247218676095668498D+00 + px(17)= 0.39600601270766777602081700578327931D+00 + px(18)= 0.49097660869784478263571961512354166D+00 + px(19)= 0.60792179107232897416464078744063313D+00 + px(20)= 0.75190061348713360022081088193051979D+00 + px(21)= 0.92910925563205451252801298079690580D+00 + px(22)= 0.11471370551908310980346755006364606D+01 + px(23)= 0.14152827011474745738425470935393175D+01 + px(24)= 0.17449475260933662620173991491940897D+01 + px(25)= 0.21501302020535220058465237512287759D+01 + px(26)= 0.26480593984665582591226977578025315D+01 + px(27)= 0.32600222787200297912657865580064671D+01 + px(28)= 0.40124854077257132134482639353308378D+01 + px(29)= 0.49386780680939238173283121803851184D+01 + px(30)= 0.60809546519546778854776584514884910D+01 + px(31)= 0.74945653594942201002132544001003708D+01 + px(32)= 0.92541900969385753944715137178141946D+01 + px(33)= 0.11466484149563363201326491294746713D+02 + px(34)= 0.14297715117728357842590639672319382D+02 + px(35)= 0.18048653298710078396919876650411551D+02 + px(36)= 0.23452167308415151157069277254133065D+02 + pw( 1)= 0.10195057714700151361278566695804673D-02 + pw( 2)= 0.23972109360881477362914694357034945D-02 + pw( 3)= 0.38363158447342177123425836110343923D-02 + pw( 4)= 0.53744010839767438647013296257593752D-02 + pw( 5)= 0.70585351522933460613686733697482296D-02 + pw( 6)= 0.89474146486528988019752703037120466D-02 + pw( 7)= 0.11115594177177423398237576162176609D-01 + pw( 8)= 0.13658239363054657454269471936854599D-01 + pw( 9)= 0.16695920500566358906782047930663171D-01 + pw(10)= 0.20378981713905551927970998274789754D-01 + pw(11)= 0.24891420496520210335919294073059272D-01 + pw(12)= 0.30454995468399097263969267647541847D-01 + pw(13)= 0.37334924544188163864246559109559135D-01 + pw(14)= 0.45848499525488242728225859778156080D-01 + pw(15)= 0.56377323577091624057815057252308315D-01 + pw(16)= 0.69383296835632602012769529156119272D-01 + pw(17)= 0.85428366438632838815674340562473751D-01 + pw(18)= 0.10519836473842813187954075539197942D+00 + pw(19)= 0.12953172788969281249628541902884322D+00 + pw(20)= 0.15945437502320699870857044488189859D+00 + pw(21)= 0.19622256830996434214924230362068723D+00 + pw(22)= 0.24137631349243177937384183525770442D+00 + pw(23)= 0.29680703930305420431908018221051720D+00 + pw(24)= 0.36484530976842630509122266207581834D+00 + pw(25)= 0.44837789623740881716704249442275046D+00 + pw(26)= 0.55101005426642996452229825498001978D+00 + pw(27)= 0.67730110868383751051906031936646973D+00 + pw(28)= 0.83312535104283525888024444527691004D+00 + pw(29)= 0.10262590489160128936079715697075387D+01 + pw(30)= 0.12674001847129782122095271537034034D+01 + pw(31)= 0.15720761022006313395632370048802129D+01 + pw(32)= 0.19645432933987013230055443656345311D+01 + pw(33)= 0.24867374093856289898664232230940440D+01 + pw(34)= 0.32225090907915900251361180974071379D+01 + pw(35)= 0.43831496004036045715808964702697293D+01 + pw(36)= 0.68154560080748566632769223367688120D+01 +endif +if(kn == 37) then + px( 1)= 0.38582699810911109314412610582600285D-03 + px( 2)= 0.20425701942290192876130910611009103D-02 + px( 3)= 0.50633691210660737148684629408797998D-02 + px( 4)= 0.95207896708291718422773083291578438D-02 + px( 5)= 0.15525778682735758953134513377993514D-01 + px( 6)= 0.23236458251477143892762148491421503D-01 + px( 7)= 0.32869898315502381365977793756399700D-01 + px( 8)= 0.44717783821182009237592740999277480D-01 + px( 9)= 0.59166282378413474386219652173387790D-01 + px(10)= 0.76720016407611302507577044991006663D-01 + px(11)= 0.98029725568599136751267367485311366D-01 + px(12)= 0.12392331356464118155739304688774820D+00 + px(13)= 0.15544072437387706131982741857227680D+00 + px(14)= 0.19387425549171156055801006554376491D+00 + px(15)= 0.24081693190720646255336253748180444D+00 + px(16)= 0.29822205905402883495704063798959865D+00 + px(17)= 0.36847715057286422135924881742734873D+00 + px(18)= 0.45449546602004465241771763312107545D+00 + px(19)= 0.55982869948207391699702170763669914D+00 + px(20)= 0.68880504649787296744511192196333508D+00 + px(21)= 0.84669796033901533360995646038438838D+00 + px(22)= 0.10399324356558473561934598413040205D+01 + px(23)= 0.12763377882779834661586906782407420D+01 + px(24)= 0.15654589805265212137286807835074586D+01 + px(25)= 0.19189432205583964070987477122905362D+01 + px(26)= 0.23510260307961591423359389441199440D+01 + px(27)= 0.28791534439182687258499291870153821D+01 + px(28)= 0.35247986674670581588018777649051460D+01 + px(29)= 0.43145708343894232186639323097990119D+01 + px(30)= 0.52817877652195029549414854087069499D+01 + px(31)= 0.64688326916686594614365796118970700D+01 + px(32)= 0.79309297085796581254349823893132444D+01 + px(33)= 0.97427028243618144296296506331896705D+01 + px(34)= 0.12010788145502681400420281251059076D+02 + px(35)= 0.14901634128904268951043131468444574D+02 + px(36)= 0.18716850108988162034804184337830585D+02 + px(37)= 0.24192677368936148334878341907713337D+02 + pw( 1)= 0.99125336551824862194889482679760082D-03 + pw( 2)= 0.23294764153889055024996601218914981D-02 + pw( 3)= 0.37240772547834537676742266421935443D-02 + pw( 4)= 0.52091149342528848273366607113774535D-02 + pw( 5)= 0.68270221227859789716885624541247654D-02 + pw( 6)= 0.86302549871569688435939302492406456D-02 + pw( 7)= 0.10684907929606001826647765915296935D-01 + pw( 8)= 0.13074823650284587459124044679067858D-01 + pw( 9)= 0.15905815137012769943635728692865399D-01 + pw(10)= 0.19309592949421631021202995586098476D-01 + pw(11)= 0.23447231955796843102356124526512731D-01 + pw(12)= 0.28512602643555882480364043596258204D-01 + pw(13)= 0.34736803269082359319855891282489532D-01 + pw(14)= 0.42394778196431500823106022890118590D-01 + pw(15)= 0.51814900990726921245206073768999637D-01 + pw(16)= 0.63391754887352977638748380428399567D-01 + pw(17)= 0.77602103176771937423249866012581660D-01 + pw(18)= 0.95024188094564273453247510650970162D-01 + pw(19)= 0.11636084820208068310423973571076075D+00 + pw(20)= 0.14246733764500807361275561785541076D+00 + pw(21)= 0.17438513757282787980358846897741684D+00 + pw(22)= 0.21338354906834882772151573726954513D+00 + pw(23)= 0.26101159566291693588807442987832548D+00 + pw(24)= 0.31916397294414160306456288368801219D+00 + pw(25)= 0.39016685675452558044568607656388734D+00 + pw(26)= 0.47689304458379631259135969323199221D+00 + pw(27)= 0.58292255373275454371013059783773154D+00 + pw(28)= 0.71277724055369180393369777699867214D+00 + pw(29)= 0.87228215826925362283182363826585130D+00 + pw(30)= 0.10691555317433509592820312414251932D+01 + pw(31)= 0.13140356602358313756291999585335256D+01 + pw(32)= 0.16224029261605229346735838359067505D+01 + pw(33)= 0.20185076394260072518838683149008566D+01 + pw(34)= 0.25443795429863924820797324346795088D+01 + pw(35)= 0.32842160798477830990525297742207531D+01 + pw(36)= 0.44504422639266402367212097908451198D+01 + pw(37)= 0.68950534584189288110300715731079937D+01 +endif +if(kn == 38) then + px( 1)= 0.37544882641214739544404487691314902D-03 + px( 2)= 0.19871183957226785505693683210337852D-02 + px( 3)= 0.49235906141676391015802154321946533D-02 + px( 4)= 0.92514737512580193448510992395727495D-02 + px( 5)= 0.15072210889929832914852564552062275D-01 + px( 6)= 0.22529747565558688599709359818783270D-01 + px( 7)= 0.31820706479818463453808764992549706D-01 + px( 8)= 0.43207917804566229579979815937989733D-01 + px( 9)= 0.57037614703055219788340245932475170D-01 + px(10)= 0.73760278305643433875885418326364923D-01 + px(11)= 0.93954814072806105125535625384970094D-01 + px(12)= 0.11835572279751275812340426098616966D+00 + px(13)= 0.14788342387597970850254556771271985D+00 + px(14)= 0.18367881184867197784381084444678116D+00 + px(15)= 0.22714406717510235393336995794234796D+00 + px(16)= 0.27999230770730436478836636945630613D+00 + px(17)= 0.34430881911279975709736095638802572D+00 + px(18)= 0.42262660561797928237112864643574725D+00 + px(19)= 0.51801914793576724661446993655612171D+00 + px(20)= 0.63421369082081852676499438226656058D+00 + px(21)= 0.77572913890901541203968257694714068D+00 + px(22)= 0.94804373045176433040869132064288662D+00 + px(23)= 0.11577991608735723735204737419458983D+01 + px(24)= 0.14130499332620481162978195094240590D+01 + px(25)= 0.17235697910838497297077019443263907D+01 + px(26)= 0.21012318099421305470189826928381899D+01 + px(27)= 0.25604863031155004117350909468147106D+01 + px(28)= 0.31189733700354116534218512680792939D+01 + px(29)= 0.37983289491193925871766329237375059D+01 + px(30)= 0.46252830528399268638347963582117115D+01 + px(31)= 0.56332239825265713634766419255858690D+01 + px(32)= 0.68645515916046830561016745830456758D+01 + px(33)= 0.83744595975704688906750898197385297D+01 + px(34)= 0.10237521325244942798063968606118549D+02 + px(35)= 0.12560368707162995423911721031640974D+02 + px(36)= 0.15509653816608570103011114649053437D+02 + px(37)= 0.19387848243793684608247770401886161D+02 + px(38)= 0.24934561281635323945874914765928781D+02 + pw( 1)= 0.96453265301318427548094362468201990D-03 + pw( 2)= 0.22655173902979609807058505788853476D-02 + pw( 3)= 0.36184028900176044156504296271444603D-02 + pw( 4)= 0.50541489537150279336470105887486457D-02 + pw( 5)= 0.66111480676528721493086153967962652D-02 + pw( 6)= 0.83364709970141383682744167961798317D-02 + pw( 7)= 0.10288965239682486008404941690219778D-01 + pw( 8)= 0.12542814650103402794331337460320134D-01 + pw( 9)= 0.15191255363614986206935175799741358D-01 + pw(10)= 0.18350103409130288787013350611437802D-01 + pw(11)= 0.22160882105318682870151221782836377D-01 + pw(12)= 0.26793767627663720473790893510507021D-01 + pw(13)= 0.32451109787899399883524910095936794D-01 + pw(14)= 0.39372541954009629417292538828294592D-01 + pw(15)= 0.47842474290684012119499234323721647D-01 + pw(16)= 0.58200301955035707564120883548436910D-01 + pw(17)= 0.70853350044334604149559560301899009D-01 + pw(18)= 0.86292591208817884216614254072269241D-01 + pw(19)= 0.10511141477535783724613732416344133D+00 + pw(20)= 0.12802804286358628862797652201096219D+00 + pw(21)= 0.15591251149396720172437659323068423D+00 + pw(22)= 0.18981949259837187914390190669151597D+00 + pw(23)= 0.23102871661757404151219056182072738D+00 + pw(24)= 0.28109550616216987958688311291596089D+00 + pw(25)= 0.34191517456611856887400710976438959D+00 + pw(26)= 0.41580717321823436217712748531850542D+00 + pw(27)= 0.50562861773538920954280122211069833D+00 + pw(28)= 0.61493358954575527626904288123143367D+00 + pw(29)= 0.74820721198453838888752387245859917D+00 + pw(30)= 0.91122787666637256187790465660240598D+00 + pw(31)= 0.11116604876702928314541969577850448D+01 + pw(32)= 0.13600906035248579424433723117814169D+01 + pw(33)= 0.16719605033700192144793651594580592D+01 + pw(34)= 0.20715242290059813187830495943964160D+01 + pw(35)= 0.26009149126553161918961209793496358D+01 + pw(36)= 0.33446822473746498943306001989892373D+01 + pw(37)= 0.45163779904182883931050846768355054D+01 + pw(38)= 0.69731288252045659086112524629580346D+01 +endif +if(kn == 39) then + px( 1)= 0.36561668715257863266695867839020084D-03 + px( 2)= 0.19346233268023844607752680140092771D-02 + px( 3)= 0.47914454654721250429608171020940566D-02 + px( 4)= 0.89973768150331118636464557997751850D-02 + px( 5)= 0.14645426459456932116061991386768235D-01 + px( 6)= 0.21867035291160181873696376563265340D-01 + px( 7)= 0.30840905275941465760959181463130484D-01 + px( 8)= 0.41804727278858172089611260923812570D-01 + px( 9)= 0.55070109596011254633584203260158210D-01 + px(10)= 0.71040748192925559559810356000599258D-01 + px(11)= 0.90233607305000360208735980487986444D-01 + px(12)= 0.11330278644013276387294932973826412D+00 + px(13)= 0.14106605202734226946949558771740988D+00 + px(14)= 0.17453471922456742887333856505611605D+00 + px(15)= 0.21494840120203884103453089817606487D+00 + px(16)= 0.26381673974086082406623827494703215D+00 + px(17)= 0.32297045833724193805138734154727157D+00 + px(18)= 0.39462409178162717260419023658129017D+00 + px(19)= 0.48145279757007806226628785793904093D+00 + px(20)= 0.58668591241992432419881469135301121D+00 + px(21)= 0.71422043441619441875490470925759117D+00 + px(22)= 0.86875839397007365611188934469437619D+00 + px(23)= 0.10559731561016268223241013124184012D+01 + px(24)= 0.12827111749931695832113875967148563D+01 + px(25)= 0.15572378120557119987867239447717751D+01 + px(26)= 0.18895389181496921659373840308737441D+01 + px(27)= 0.22916946565666188923157402572614733D+01 + px(28)= 0.27783497396142931899333530655336223D+01 + px(29)= 0.33673171344416414054075236409384439D+01 + px(30)= 0.40803746579421435360342926127423516D+01 + px(31)= 0.49443541982197303363350679681860647D+01 + px(32)= 0.59926991656565097356612507658187921D+01 + px(33)= 0.72678162022521051390335450639073869D+01 + px(34)= 0.88248661080276432761936204880867089D+01 + px(35)= 0.10738377470289614240120984197486960D+02 + px(36)= 0.13114992524642316318648160324994074D+02 + px(37)= 0.16121587864037157860973324745148775D+02 + px(38)= 0.20061517632007615495343028194490514D+02 + px(39)= 0.25677751590102375173572742100364982D+02 + pw( 1)= 0.93922220165197278758235278705125378D-03 + pw( 2)= 0.22050238560788499833482429614154492D-02 + pw( 3)= 0.35187216063020038804486790499409390D-02 + pw( 4)= 0.49085365506617880092326144941967522D-02 + pw( 5)= 0.64093236535073721253961413565734053D-02 + pw( 6)= 0.80634829478885867255854065621590218D-02 + pw( 7)= 0.99236234747694546900134742610818704D-02 + pw( 8)= 0.12055651790884970603397688654161516D-01 + pw( 9)= 0.14542041549622581450180752510105879D-01 + pw(10)= 0.17484998327571243644029476510788471D-01 + pw(11)= 0.21009295862219636601744654763649681D-01 + pw(12)= 0.25264862509860073296774992740630911D-01 + pw(13)= 0.30429647240913452161840939756263150D-01 + pw(14)= 0.36713599504420717007290949145895236D-01 + pw(15)= 0.44364526109307181291869691370038599D-01 + pw(16)= 0.53676231693459070390920038412533532D-01 + pw(17)= 0.64999020675906355562370375168522980D-01 + pw(18)= 0.78752553989958863994856994484156486D-01 + pw(19)= 0.95441198327713183692637121288756175D-01 + pw(20)= 0.11567225339206538896170499893449828D+00 + pw(21)= 0.14017770592600709076251201422928256D+00 + pw(22)= 0.16984043156916862434884153031547730D+00 + pw(23)= 0.20572609864904223612685592628540649D+00 + pw(24)= 0.24912251113833289072621517399741476D+00 + pw(25)= 0.30158889670929686638710684005186011D+00 + pw(26)= 0.36501892348491283180440889799004221D+00 + pw(27)= 0.44172341189659552954879007161166844D+00 + pw(28)= 0.53454253015480261760607954897116394D+00 + pw(29)= 0.64700413373280623862585575306735597D+00 + pw(30)= 0.78355765539183802574608463599148275D+00 + pw(31)= 0.94993752822514424638014213628449162D+00 + pw(32)= 0.11537599191614455897031555123375337D+01 + pw(33)= 0.14055641164435818777056131176331966D+01 + pw(34)= 0.17207623206557334615184702866032957D+01 + pw(35)= 0.21236210670677286367475649834301224D+01 + pw(36)= 0.26563848390712257212223490954097065D+01 + pw(37)= 0.34039597582687380169009802962894561D+01 + pw(38)= 0.45810165654673262153491823163228067D+01 + pw(39)= 0.70497477205153040846297597338553262D+01 +endif +if(kn == 40) then + px( 1)= 0.35628858567646346686837324510549370D-03 + px( 2)= 0.18848536486770311807591243848705546D-02 + px( 3)= 0.46663181280317856746168443158323968D-02 + px( 4)= 0.87572190745324032833162010260613025D-02 + px( 5)= 0.14243053462025729117427416370397854D-01 + px( 6)= 0.21244187714177671012923575144774521D-01 + px( 7)= 0.29923549070178329860704059006245261D-01 + px( 8)= 0.40496827285887221934804797848083118D-01 + px( 9)= 0.53245474481585595162008164942410512D-01 + px(10)= 0.68532582430157398594384714629729053D-01 + px(11)= 0.86821479567961854490494392488927025D-01 + px(12)= 0.10869675959524067911380119096839804D+00 + px(13)= 0.13488761722649275069059959653024014D+00 + px(14)= 0.16629388885455273807852247897310019D+00 + px(15)= 0.20401590432120076472183920609098846D+00 + px(16)= 0.24938984749465339706889280753165521D+00 + px(17)= 0.30403061355446670063478194165459695D+00 + px(18)= 0.36988419909316149270323787724602738D+00 + px(19)= 0.44929166750312582192671647579960907D+00 + px(20)= 0.54506686870623140996608099899850594D+00 + px(21)= 0.66059043398617261201896956069992349D+00 + px(22)= 0.79992312516554243682533149390065526D+00 + px(23)= 0.96794240207734869040410161024196399D+00 + px(24)= 0.11705071340447720718704866053067197D+01 + px(25)= 0.14146568411163837104501550620112388D+01 + px(26)= 0.17088539364613371655586392377700409D+01 + px(27)= 0.20632805527205428543659205472373927D+01 + px(28)= 0.24902063739619416723013658330633921D+01 + px(29)= 0.30044517126674338175114949672570867D+01 + px(30)= 0.36239831566943951780098581431720407D+01 + px(31)= 0.43707017882279510268089904263627705D+01 + px(32)= 0.52715247584257797079839546521646347D+01 + px(33)= 0.63599375545114314246728093354610643D+01 + px(34)= 0.76783457965158492083638000098773087D+01 + px(35)= 0.92818762327705176821137326934557272D+01 + px(36)= 0.11245019162985129683252871377987811D+02 + px(37)= 0.13674441002824139783929760948169562D+02 + px(38)= 0.16737262031081031207287624813172862D+02 + px(39)= 0.20737736626704553460329127303821744D+02 + px(40)= 0.26422185009410264431343933932848600D+02 + pw( 1)= 0.91521311273862401110414707754929676D-03 + pw( 2)= 0.21477191521928219197974773759587103D-02 + pw( 3)= 0.34245277733745781136392870149979323D-02 + pw( 4)= 0.47714307428687057651870425259278087D-02 + pw( 5)= 0.62201720373839252470536835470530964D-02 + pw( 6)= 0.78090816762799270025355432702422127D-02 + pw( 7)= 0.95853728776285881753692296587104674D-02 + pw( 8)= 0.11607827206228475028893871423142139D-01 + pw( 9)= 0.13949671718334027814092300225470600D-01 + pw(10)= 0.16701414648594482319533882783791014D-01 + pw(11)= 0.19973423842092704666661066907283955D-01 + pw(12)= 0.23898237345016617036372142753573396D-01 + pw(13)= 0.28632957303620028137537791558953675D-01 + pw(14)= 0.34362389756581172586707762233025920D-01 + pw(15)= 0.41303629060426009015015262453667296D-01 + pw(16)= 0.49712538117407657728121783442595623D-01 + pw(17)= 0.59892265779927519547458491064456622D-01 + pw(18)= 0.72203793116717427042782050909270304D-01 + pw(19)= 0.87078559354632307141427375007562753D-01 + pw(20)= 0.10503340154082661838457995134010675D+00 + pw(21)= 0.12668825810872029062869005987170137D+00 + pw(22)= 0.15278730399841792433216511718627756D+00 + pw(23)= 0.18422442714170367120354764302353497D+00 + pw(24)= 0.22207427969059917966672557751489214D+00 + pw(25)= 0.26763062752730913486480650477422812D+00 + pw(26)= 0.32245451049240744596849032983153986D+00 + pw(27)= 0.38843603675318055142995014015287773D+00 + pw(28)= 0.46787586661529213993652263387714350D+00 + pw(29)= 0.56359633220992492533897924842606201D+00 + pw(30)= 0.67909910572915995345517074647986275D+00 + pw(31)= 0.81879920277693732798543935642566763D+00 + pw(32)= 0.98838986255598396167621024931216632D+00 + pw(33)= 0.11954429228293928218210248618287692D+01 + pw(34)= 0.14504574076282353803601676013439121D+01 + pw(35)= 0.17688226967485673903033088821262440D+01 + pw(36)= 0.21748255570140773474399755838047824D+01 + pw(37)= 0.27108286283847583355764140776741630D+01 + pw(38)= 0.34620975677969687591016085409314313D+01 + pw(39)= 0.46444137521824838114376662229079725D+01 + pw(40)= 0.71249713212825268738104553603659849D+01 +endif +if(kn == 41) then + px( 1)= 0.34742673101001169298125786189192247D-03 + px( 2)= 0.18376016626066348589572611512977315D-02 + px( 3)= 0.45476582212664459469467031011454626D-02 + px( 4)= 0.85298627941631364768947297314276555D-02 + px( 5)= 0.13862998302969016172574004035541326D-01 + px( 6)= 0.20657586094570613395846290407493021D-01 + px( 7)= 0.29062610848297752288163047809306784D-01 + px( 8)= 0.39274425358723378021088136325757383D-01 + px( 9)= 0.51548104852787877566528890250659503D-01 + px(10)= 0.66211354354233299700140534259286291D-01 + px(11)= 0.83680871232624778086967339679729237D-01 + px(12)= 0.10448092042565902676173295142297193D+00 + px(13)= 0.12926394759018944327153762885108248D+00 + px(14)= 0.15883342588000497563552833153551216D+00 + px(15)= 0.19416971506631978910646170849840836D+00 + px(16)= 0.23646026989593002896655227863024343D+00 + px(17)= 0.28713586785719640965554266593644607D+00 + px(18)= 0.34791462052391896376462581083932962D+00 + px(19)= 0.42085552853141849349998238918417102D+00 + px(20)= 0.50842339932407666570140739641291868D+00 + px(21)= 0.61356716058210435249023757196485689D+00 + px(22)= 0.73981399451706291428067431492187345D+00 + px(23)= 0.89138229125166542251073985530751701D+00 + px(24)= 0.10733171950376849976883148880289368D+01 + px(25)= 0.12916535619608503598632452641060769D+01 + px(26)= 0.15536125962495445292827177181517718D+01 + px(27)= 0.18678405221272713224762723085314113D+01 + px(28)= 0.22447007897223064544921318025044505D+01 + px(29)= 0.26966362208010185020303083586792707D+01 + px(30)= 0.32386254398637317225899745563640301D+01 + px(31)= 0.38887712166747737454815122744695001D+01 + px(32)= 0.46690813167550880509031095796259491D+01 + px(33)= 0.56065436673995350526181894949125319D+01 + px(34)= 0.67346748355004755553478375342343373D+01 + px(35)= 0.80958733206377174584850755308484861D+01 + px(36)= 0.97452318197102039374768068435419398D+01 + px(37)= 0.11757209117436549173752389088784801D+02 + px(38)= 0.14238509041698890616269777375250039D+02 + px(39)= 0.17356513104267573046562740943972043D+02 + px(40)= 0.21416391258741087275776630427364534D+02 + px(41)= 0.27167802059826190961056594217938860D+02 + pw( 1)= 0.89240744144922233411809208993761819D-03 + pw( 2)= 0.20933555605342582529457970383304286D-02 + pw( 3)= 0.33353720205083480430061442115773668D-02 + pw( 4)= 0.46420859644861606778799308270413399D-02 + pw( 5)= 0.60424942046045364422720752662654441D-02 + pw( 6)= 0.75713643924191847990582966217707736D-02 + pw( 7)= 0.92712216138022670728066109402065568D-02 + pw( 8)= 0.11194687905391631640874958915231129D-01 + pw( 9)= 0.13407015211365284658909716719520612D-01 + pw(10)= 0.15988623214362774067334406180515621D-01 + pw(11)= 0.19037444847571012625771486857897508D-01 + pw(12)= 0.22671015569010270758765332286863717D-01 + pw(13)= 0.27028523256092179219838374149240132D-01 + pw(14)= 0.32273327230549150524506142920429224D-01 + pw(15)= 0.38596560710149393493004578367090095D-01 + pw(16)= 0.46222282500302385970118307137327059D-01 + pw(17)= 0.55414377032731402665476377100171599D-01 + pw(18)= 0.66485217878875067216476264141658569D-01 + pw(19)= 0.79806099857296581997108727302834078D-01 + pw(20)= 0.95819567831341990475832520086841170D-01 + pw(21)= 0.11505394471321825395206987370902480D+00 + pw(22)= 0.13814054039338535506658629671808687D+00 + pw(23)= 0.16583420916823077593541262981490681D+00 + pw(24)= 0.19903814998014368860888042911909173D+00 + pw(25)= 0.23883416675372027779180574899396133D+00 + pw(26)= 0.28652010702264135124796999487042423D+00 + pw(27)= 0.34365700663826115610743525613053815D+00 + pw(28)= 0.41212981100557300459399743393003457D+00 + pw(29)= 0.49422782047790986141383860345034700D+00 + pw(30)= 0.59275496121528084438932784828140258D+00 + pw(31)= 0.71118703347840582157967515903715531D+00 + pw(32)= 0.85390607737164494406520343373727721D+00 + pw(33)= 0.10265669138579149759586965414767668D+01 + pw(34)= 0.12367012590999922651850699786000809D+01 + pw(35)= 0.14947734294405908884100490275594732D+01 + pw(36)= 0.18161565391675246807943861856738552D+01 + pw(37)= 0.22251644165626364386518263812767700D+01 + pw(38)= 0.27642836611922099334373971118073338D+01 + pw(39)= 0.35191416662378097344643779658011469D+01 + pw(40)= 0.47066216438742057965343731115092608D+01 + pw(41)= 0.71988567667853812451766331822159390D+01 +endif +if(kn == 42) then + px( 1)= 0.33899702213207280303556565703741990D-03 + px( 2)= 0.17926803514586918122543179233298713D-02 + px( 3)= 0.44349720472477033588788261295928634D-02 + px( 4)= 0.83142928381370954944600712429031326D-02 + px( 5)= 0.13503405398912139202218387144909328D-01 + px( 6)= 0.20104047687222803992574530787065418D-01 + px( 7)= 0.28252833643724993821423023769509983D-01 + px( 8)= 0.38129051839561461833536104861505433D-01 + px( 9)= 0.49964611823304399745794720191846551D-01 + px(10)= 0.64056255466771758626847757441909748D-01 + px(11)= 0.80779982442382913230902383270370264D-01 + px(12)= 0.10060749467110151157610640414769097D+00 + px(13)= 0.12412446888143911749100232205634547D+00 + px(14)= 0.15205071694865663047292718989423185D+00 + px(15)= 0.18526275842687218439176583557408384D+00 + px(16)= 0.22481983515906206676758114216568014D+00 + px(17)= 0.27199475262711723613680204884879081D+00 + px(18)= 0.32831107191481630404052581830045252D+00 + px(19)= 0.39558818386929789909718409480658300D+00 + px(20)= 0.47599581233231530250697095670252798D+00 + px(21)= 0.57211961616026616366050109797592391D+00 + px(22)= 0.68703982771101430011400322499023268D+00 + px(23)= 0.82442528240215569955771735381318182D+00 + px(24)= 0.98864576667159865631116805089235696D+00 + px(25)= 0.11849063745873302190560203655789150D+01 + px(26)= 0.14194085949505718343903162487063753D+01 + px(27)= 0.16995442942632463982698312503343140D+01 + px(28)= 0.20341308620656637127889186046327503D+01 + px(29)= 0.24336989628881715953740329050451161D+01 + px(30)= 0.29108493179186889815360617341447733D+01 + px(31)= 0.34807029996457758665062194913386789D+01 + px(32)= 0.41614831892896507985708808145127823D+01 + px(33)= 0.49752895914290239340663575251573810D+01 + px(34)= 0.59491683086745559122027634515663744D+01 + px(35)= 0.71166577579837023101788394059159047D+01 + px(36)= 0.85201446660709564225208283238116453D+01 + px(37)= 0.10214688587324610607847909840137625D+02 + px(38)= 0.12274723743508615874901666750507663D+02 + px(39)= 0.14807003938009410302904866873102566D+02 + px(40)= 0.17979187958243548948159702687043315D+02 + px(41)= 0.22097374571880291313408725390611447D+02 + px(42)= 0.27914546740390271130933515005738687D+02 + pw( 1)= 0.87071685062486345289172157206777171D-03 + pw( 2)= 0.20417105878963868357298871515770460D-02 + pw( 3)= 0.32508535230574150323908782582998050D-02 + pw( 4)= 0.45198431212516341141392418533095799D-02 + pw( 5)= 0.58752408889593859079959250826252330D-02 + pw( 6)= 0.73486832934924937540882060804195150D-02 + pw( 7)= 0.89786045784956752332612520710862566D-02 + pw( 8)= 0.10812279756759579335639253148771645D-01 + pw( 9)= 0.12908056239717444457676015564846892D-01 + pw(10)= 0.15337623022218768136530241218842739D-01 + pw(11)= 0.18188143417513304840817866168341234D-01 + pw(12)= 0.21564159053085002516938026776853287D-01 + pw(13)= 0.25589384327944258948303670138994570D-01 + pw(14)= 0.30408768217117848936156421895538494D-01 + pw(15)= 0.36191344708155839281431088511826248D-01 + pw(16)= 0.43134326776658801296020544503200249D-01 + pw(17)= 0.51468687405796338041097105695766697D-01 + pw(18)= 0.61466278505771427877600196882159870D-01 + pw(19)= 0.73448477231342626126309188685807545D-01 + pw(20)= 0.87796417742468560052699023987407758D-01 + pw(21)= 0.10496300171125714320281462758628502D+00 + pw(22)= 0.12548703017795381911118827176515123D+00 + pw(23)= 0.15000994875313221416387070158771740D+00 + pw(24)= 0.17929586510892162331076812701124762D+00 + pw(25)= 0.21425571841544983392913772225490756D+00 + pw(26)= 0.25597680754299265400920520155620068D+00 + pw(27)= 0.30575939808892163354965075699382712D+00 + pw(28)= 0.36516295819168300959340602426394554D+00 + pw(29)= 0.43606594548816172375527529406447798D+00 + pw(30)= 0.52074538635856855968547701789163874D+00 + pw(31)= 0.62198649880275266624161896479324475D+00 + pw(32)= 0.74323973657036909817079545894769618D+00 + pw(33)= 0.88885572273739267168333040860537136D+00 + pw(34)= 0.10644536078580082753759483481056876D+01 + pw(35)= 0.12775289785980491630656102334631630D+01 + pw(36)= 0.15385165702010908217004674883905600D+01 + pw(37)= 0.18627791456676687262611182457669180D+01 + pw(38)= 0.22746636192228746631618234661193503D+01 + pw(39)= 0.28167854835143406837432208820997977D+01 + pw(40)= 0.35751353025579259567249078677450433D+01 + pw(41)= 0.47676889791677097956471776553934610D+01 + pw(42)= 0.72714575121908501417344894592801099D+01 +endif +if(kn == 43) then + px( 1)= 0.33096860786757750154898847120917057D-03 + px( 2)= 0.17499208566123461094898550046151874D-02 + px( 3)= 0.43278154096847331355188904178071447D-02 + px( 4)= 0.81096003646317825364067625754120221D-02 + px( 5)= 0.13162623744349233672062735003036974D-01 + px( 6)= 0.19580760979979784765554480104376286D-01 + px( 7)= 0.27489609951663532195103068673087354D-01 + px( 8)= 0.37053342869799653592283933396820960D-01 + px( 9)= 0.48483444452707627033448163720522761D-01 + px(10)= 0.62049459953072389488156028383179189D-01 + px(11)= 0.78091737934971368286739663026054961D-01 + px(12)= 0.97036017163386886705197231591959380D-01 + px(13)= 0.11940967168918372175774848212559756D+00 + px(14)= 0.14585958712300992040134315459436264D+00 + px(15)= 0.17717200022543900754299287107723618D+00 + px(16)= 0.21429507577429224752942159606670995D+00 + px(17)= 0.25836535107930525415060613221934697D+00 + px(18)= 0.31073935543877479582966549813156744D+00 + px(19)= 0.37303174367650550564362643090028699D+00 + px(20)= 0.44716127946900452807387231860775343D+00 + px(21)= 0.53540606537110705599704958284246183D+00 + px(22)= 0.64046959170961720285075028881968045D+00 + px(23)= 0.76555947611245457114222548768598476D+00 + px(24)= 0.91448119032995475750346963701721081D+00 + px(25)= 0.10917496366295753151128433644125688D+01 + px(26)= 0.13027221869155879323859021531153006D+01 + px(27)= 0.15537578204695570075088633454278321D+01 + px(28)= 0.18524084405877676482171967152697008D+01 + px(29)= 0.22076505684206197863173711190960219D+01 + px(30)= 0.26301688303884504911030778880813732D+01 + px(31)= 0.31327077074207879423269172652003364D+01 + px(32)= 0.37305161917200709143270025763858640D+01 + px(33)= 0.44419236396092482776254163193454757D+01 + px(34)= 0.52891086092067566005114585507151512D+01 + px(35)= 0.62991644484541209348967251319712091D+01 + px(36)= 0.75056437278338836321053009023954332D+01 + px(37)= 0.89509179672327616775374703555124487D+01 + px(38)= 0.10690015208487653327362214867738207D+02 + px(39)= 0.12797352129263264861734948864857745D+02 + px(40)= 0.15379744394428552764554128123427935D+02 + px(41)= 0.18605142714959248273487455172506479D+02 + px(42)= 0.22780586028957730000926904102977113D+02 + px(43)= 0.28662366237232832381924388811320297D+02 + pw( 1)= 0.85006145870848429334757469982573891D-03 + pw( 2)= 0.19925838109657703878469708707415329D-02 + pw( 3)= 0.31706135365278565227115398483457554D-02 + pw( 4)= 0.44041172343279662518790702958553625D-02 + pw( 5)= 0.57174896648731729427179253475841935D-02 + pw( 6)= 0.71396041280184287136596813041547263D-02 + pw( 7)= 0.87053105293648014281590926123488040D-02 + pw( 8)= 0.10457223593582426133616067442291247D-01 + pw( 9)= 0.12447691003362071183889633075575652D-01 + pw(10)= 0.14740821116468595319728157346456377D-01 + pw(11)= 0.17414420402114891406970230872875573D-01 + pw(12)= 0.20561736545282396913149978083028889D-01 + pw(13)= 0.24293057998247351709738591334952020D-01 + pw(14)= 0.28737438822145949516145087089517893D-01 + pw(15)= 0.34044977279791239841839057487094614D-01 + pw(16)= 0.40390072912737209513481740353650117D-01 + pw(17)= 0.47975933504865389303614731289416436D-01 + pw(18)= 0.57040421240269648291641111667509080D-01 + pw(19)= 0.67863232374253637764436196090399013D-01 + pw(20)= 0.80774426563059878523617995686503505D-01 + pw(21)= 0.96164420051293883535914595526514204D-01 + pw(22)= 0.11449568019439733156432157598398321D+00 + pw(23)= 0.13631648279559252800439663886965342D+00 + pw(24)= 0.16227722290947194423527738296713650D+00 + pw(25)= 0.19314992693280818601029240961091576D+00 + pw(26)= 0.22985183401280394837726875217480564D+00 + pw(27)= 0.27347424857196858115127321375763169D+00 + pw(28)= 0.32531839177175415639431272795218179D+00 + pw(29)= 0.38694082912221267186717847996208730D+00 + pw(30)= 0.46021245154573169215949688893911839D+00 + pw(31)= 0.54739734403912358955407316606691874D+00 + pw(32)= 0.65126193729202891760603881973382504D+00 + pw(33)= 0.77523203175189474974013051456241330D+00 + pw(34)= 0.92362846689464044743151918155531626D+00 + pw(35)= 0.11020374135362457925484267398369432D+01 + pw(36)= 0.13179220978816197700516342241847467D+01 + pw(37)= 0.15816923933252917714931130477654841D+01 + pw(38)= 0.19087060424955494066311220499293247D+01 + pw(39)= 0.23233483559535813743753279936284135D+01 + pw(40)= 0.28683678978287494020672145638838434D+01 + pw(41)= 0.36301191889236335565768841054222539D+01 + pw(42)= 0.48276614242022139042477288493084378D+01 + pw(43)= 0.73428236438762783138460481364952627D+01 +endif +if(kn == 44) then + px( 1)= 0.32331350836957546603744084419849788D-03 + px( 2)= 0.17091703169716826666769331280363544D-02 + px( 3)= 0.42257875064315455206561743770986376D-02 + px( 4)= 0.79149690832372160827916911872302865D-02 + px( 5)= 0.12839179023186853233140061142661062D-01 + px( 6)= 0.19085232241568949577899855509474977D-01 + px( 7)= 0.26768883186791699077493305776558926D-01 + px( 8)= 0.36040864557027892650839059295103373D-01 + px( 9)= 0.47094585778667144264845412823636449D-01 + px(10)= 0.60175615761544264401796703227341314D-01 + px(11)= 0.75592961225388113581704029754538741D-01 + px(12)= 0.93732029387412788080736336661325883D-01 + px(13)= 0.11506910593622805003749038841255850D+00 + px(14)= 0.14018727035639085866142640394142696D+00 + px(15)= 0.16979393946354437271616691990993614D+00 + px(16)= 0.20474060430292067802523043906109989D+00 + px(17)= 0.24604566740448100845633933054720796D+00 + px(18)= 0.29492149096685690036119497895105667D+00 + px(19)= 0.35280682708785925696562700896642017D+00 + px(20)= 0.42140579664772257270434756016372204D+00 + px(21)= 0.50273460611520704557595850856713180D+00 + px(22)= 0.59917729885865279633574477547906320D+00 + px(23)= 0.71355204803416102949757204396579206D+00 + px(24)= 0.84918981260514039761248477195520438D+00 + px(25)= 0.10100276016887074982125673033831345D+01 + px(26)= 0.12007191488994777606413935958511916D+01 + px(27)= 0.14267665395522811775994265815554724D+01 + px(28)= 0.16946773518359189852395063278947950D+01 + px(29)= 0.20121533231038550963171865171844820D+01 + px(30)= 0.23883186915697571717337880438736199D+01 + px(31)= 0.28339996248002878664983052972434529D+01 + px(32)= 0.33620712713198530053650295556681017D+01 + px(33)= 0.39878972622760453246577762221089257D+01 + px(34)= 0.47299002992789324698627794306541718D+01 + px(35)= 0.56103262027415485179449762966436919D+01 + px(36)= 0.66563061139504917392538125507749794D+01 + px(37)= 0.79014003881557845126238132373008625D+01 + px(38)= 0.93879629282953731177093056973506411D+01 + px(39)= 0.11170992458953013913700288894894255D+02 + px(40)= 0.13324895111520577881664368628734089D+02 + px(41)= 0.15956559624176004069472090284905753D+02 + px(42)= 0.19234241988629565032886618545011883D+02 + px(43)= 0.23465930980103016989495364982244146D+02 + px(44)= 0.29411210662212999597346209126697707D+02 + pw( 1)= 0.83036884987403513181911166534208663D-03 + pw( 2)= 0.19457941864115460112729277032771097D-02 + pw( 3)= 0.30943299472937825975912487677647078D-02 + pw( 4)= 0.42943871620956718877444401513682917D-02 + pw( 5)= 0.55684261348013693314562032089920088D-02 + pw( 6)= 0.69428725530123470368722128932621244D-02 + pw( 7)= 0.84494234604398957532432243635753483D-02 + pw( 8)= 0.10126616172976027067397699017267216D-01 + pw( 9)= 0.12021566160747622141057949671782224D-01 + pw(10)= 0.14191778414242213809164508501045497D-01 + pw(11)= 0.16706905485358088359523445571590018D-01 + pw(12)= 0.19650346938023694904494456157357191D-01 + pw(13)= 0.23120695042785977245044097938079732D-01 + pw(14)= 0.27233209875082837015975056631389154D-01 + pw(15)= 0.32121665600246232435212867479076854D-01 + pw(16)= 0.37940949845194110884289250317530407D-01 + pw(17)= 0.44870698984096406199701289089460328D-01 + pw(18)= 0.53120093303752446444612781898982047D-01 + pw(19)= 0.62933823516449817737731500047201570D-01 + pw(20)= 0.74599224373678150636698930396457382D-01 + pw(21)= 0.88454634536330462816566048592135618D-01 + pw(22)= 0.10489914083790778708121373941299966D+00 + pw(23)= 0.12440396953470249784020277878699461D+00 + pw(24)= 0.14752589155074737609776847913675882D+00 + pw(25)= 0.17492312579420798336376479913219788D+00 + pw(26)= 0.20737437787760038519638439806427540D+00 + pw(27)= 0.24580187433567244121381237836878940D+00 + pw(28)= 0.29129959389673368311222620480570831D+00 + pw(29)= 0.34516843605203008284533614069195425D+00 + pw(30)= 0.40896093500380134048288051036096437D+00 + pw(31)= 0.48453955284019804235911876681118743D+00 + pw(32)= 0.57415497731524515579134334664918167D+00 + pw(33)= 0.68055495684587130214889340452349758D+00 + pw(34)= 0.80714146055673592921708826105860249D+00 + pw(35)= 0.95820721891454345652447310233062356D+00 + pw(36)= 0.11393080348437704220650007990276423D+01 + pw(37)= 0.13578783180672521841897407574039451D+01 + pw(38)= 0.16243074162684344243952826001273292D+01 + pw(39)= 0.19539528530618256474917072512577991D+01 + pw(40)= 0.23712430126501098428281754997758214D+01 + pw(41)= 0.29190630527561174415810627420409994D+01 + pw(42)= 0.36841316877299013922727992267156054D+01 + pw(43)= 0.48865818259133309055220603437140743D+01 + pw(44)= 0.74130021614058319715789731577046943D+01 +endif +if(kn == 45) then + px( 1)= 0.31600628834973779292559127048190037D-03 + px( 2)= 0.16702900106190273328935895866739893D-02 + px( 3)= 0.41285257114804008864642830706317964D-02 + px( 4)= 0.77296636135218507544800673352591724D-02 + px( 5)= 0.12531750213154611452085965505028540D-01 + px( 6)= 0.18615241131540077290889348326202186D-01 + px( 7)= 0.26087066631843252886061485477515189D-01 + px( 8)= 0.35085969566222178513934460224332887D-01 + px( 9)= 0.45789306577033888813174278261897663D-01 + px(10)= 0.58421434435784388362459047968056087D-01 + px(11)= 0.73263711622882621260808485115770070D-01 + px(12)= 0.90666037430716786669379135384670844D-01 + px(13)= 0.11105978210403381887459852100321712D+00 + px(14)= 0.13497200418547898709664745719407710D+00 + px(15)= 0.16304104546216750230118904547804422D+00 + px(16)= 0.19603390202336562386932779699909816D+00 + px(17)= 0.23486608544820513725469302847128384D+00 + px(18)= 0.28062490631545969684155812688765467D+00 + px(19)= 0.33459720050887095268667406972580638D+00 + px(20)= 0.39830252448023971951147973257838648D+00 + px(21)= 0.47353284765622731594152923198004578D+00 + px(22)= 0.56239982933240531713761978532829046D+00 + px(23)= 0.66739091019498584233201402129369733D+00 + px(24)= 0.79143567909811363215071199148744656D+00 + px(25)= 0.93798429449898007763809445265874043D+00 + px(26)= 0.11111001581385252565298438994097319D+01 + px(27)= 0.13155695863345333806714751926528375D+01 + px(28)= 0.15570319587074322621285835341493748D+01 + px(29)= 0.18421348417902183390620002381054988D+01 + px(30)= 0.21787200444979152172206336828110265D+01 + px(31)= 0.25760487257631498506605264603704116D+01 + px(32)= 0.30450769803228709332081128619189764D+01 + px(33)= 0.35987985256158263799485414495379126D+01 + px(34)= 0.42526795117768400994862312123212388D+01 + px(35)= 0.50252244422171176606674140586086445D+01 + px(36)= 0.59387361521365530542536690567593329D+01 + px(37)= 0.70203754298618509877391417240792276D+01 + px(38)= 0.83037051949719291418317245580593492D+01 + px(39)= 0.98310601809174348734008913125923897D+01 + px(40)= 0.11657412426662143324540788749464127D+02 + px(41)= 0.13857164426266931434200566791289955D+02 + px(42)= 0.16537288540540952756080764303901777D+02 + px(43)= 0.19866358206258941542582357564294186D+02 + px(44)= 0.24153320185396072162240284411839110D+02 + px(45)= 0.30161032818176703094230836042654050D+02 + pw( 1)= 0.81157322026241886094392714295783692D-03 + pw( 2)= 0.19011777475163676304804330211208542D-02 + pw( 3)= 0.30217126568442239903479316417011601D-02 + pw( 4)= 0.41901870013204838998208477401165665D-02 + pw( 5)= 0.54273283828323581210343431353844446D-02 + pw( 6)= 0.67573866388465515363650925512746062D-02 + pw( 7)= 0.82092751225427288897814574011554460D-02 + pw( 8)= 0.98179504955536258107199257981433439D-02 + pw( 9)= 0.11625949429528062575538287604998941D-01 + pw(10)= 0.13685006654390859174410475886206370D-01 + pw(11)= 0.16057648452306267238525538418198228D-01 + pw(12)= 0.18818661412340581662760322741958838D-01 + pw(13)= 0.22056412054981119160700553625749748D-01 + pw(14)= 0.25874134633155885709619608590472202D-01 + pw(15)= 0.30391451859719384524106580753360271D-01 + pw(16)= 0.35746460932048448724214073937351505D-01 + pw(17)= 0.42098664685115079575098628197326142D-01 + pw(18)= 0.49632898864791727296990224145235518D-01 + pw(19)= 0.58564289944594589064829987997677920D-01 + pw(20)= 0.69144234514984419908558444457124034D-01 + pw(21)= 0.81667423637721859042077146566843517D-01 + pw(22)= 0.96480011247988074883978780802066204D-01 + pw(23)= 0.11398911344507944971212279753122991D+00 + pw(24)= 0.13467391267458107409324694436955753D+00 + pw(25)= 0.15909873207292332703499649578337212D+00 + pw(26)= 0.18792855598692739186974920544448162D+00 + pw(27)= 0.22194762553095863671304473597844931D+00 + pw(28)= 0.26208196494751982936477406183051153D+00 + pw(29)= 0.30942704406840347450420829122426871D+00 + pw(30)= 0.36528233311686115136869183803961988D+00 + pw(31)= 0.43119539125683582249214467187083528D+00 + pw(32)= 0.50901957973734783536801637831942790D+00 + pw(33)= 0.60099191357850333533977421143095798D+00 + pw(34)= 0.70984171451926250865508439176773176D+00 + pw(35)= 0.83894803839540918949226578045417203D+00 + pw(36)= 0.99257719538081268693583754948673040D+00 + pw(37)= 0.11762571380368294152931731126534612D+01 + pw(38)= 0.13973967807100820386062004887130792D+01 + pw(39)= 0.16663689232723581345802345799199851D+01 + pw(40)= 0.19985351917633034604206124546990186D+01 + pw(41)= 0.24183711599931949498296065604711404D+01 + pw(42)= 0.29689015304223315699189706042837967D+01 + pw(43)= 0.37372089827137538865937716176457237D+01 + pw(44)= 0.49444904398530770204722137069339350D+01 + pw(45)= 0.74820372303276097896128738491191685D+01 +endif +if(kn == 46) then + px( 1)= 0.30902377399120893788354424898855426D-03 + px( 2)= 0.16331537506671398843215268321368443D-02 + px( 3)= 0.40357010980175320995569579449408174D-02 + px( 4)= 0.75530195763806797812195436379199476D-02 + px( 5)= 0.12239149854724502952182976196758722D-01 + px( 6)= 0.18168803621919060471207612598191755D-01 + px( 7)= 0.25440976363694145320552734001486569D-01 + px( 8)= 0.34183679422904259346408703316308100D-01 + px( 9)= 0.44559964643964573543760075913259475D-01 + px(10)= 0.56775358517699538859278511856645348D-01 + px(11)= 0.71086748757462020933356688085151754D-01 + px(12)= 0.87812673145710441814424785984121246D-01 + px(13)= 0.10734488854150171164189046430233515D+00 + px(14)= 0.13016110718036063404346759877348097D+00 + px(15)= 0.15683892196176944176774935350653314D+00 + px(16)= 0.18807118924767432734960867652772029D+00 + px(17)= 0.22468341948583423108691031411123462D+00 + px(18)= 0.26765394743983501400088905846999460D+00 + px(19)= 0.31813776531045013135926232571664330D+00 + px(20)= 0.37749492399880062738311332663588965D+00 + px(21)= 0.44732440247018853385136813722855067D+00 + px(22)= 0.52950437210775343555603255060398518D+00 + px(23)= 0.62623987488687805361419457851831370D+00 + px(24)= 0.74011910006308993968762196795007188D+00 + px(25)= 0.87417968361959824522374584708126947D+00 + px(26)= 0.10319867719667534605241559659266134D+01 + px(27)= 0.12177250028776661981781648200867157D+01 + px(28)= 0.14363070973204841479279696750353849D+01 + px(29)= 0.16935024864806789035793857077605729D+01 + px(30)= 0.19960904185444808460178685393801009D+01 + px(31)= 0.23520434618090888941520463681053941D+01 + px(32)= 0.27707495103418382337989450786653183D+01 + px(33)= 0.32632837492864188626066742354315931D+01 + px(34)= 0.38427473041855077266339137871017428D+01 + px(35)= 0.45246978009253960226440639549407685D+01 + px(36)= 0.53277111750428588477775437304742452D+01 + px(37)= 0.62741382355089597665586817773799499D+01 + px(38)= 0.73911624234084726194161817156844889D+01 + px(39)= 0.87123449938120379656808053147144537D+01 + px(40)= 0.10280000673796667050592943428615469D+02 + px(41)= 0.12149077777983097408939533602647387D+02 + px(42)= 0.14393981931423795298134131653931530D+02 + px(43)= 0.17121779022116518315203538273624565D+02 + px(44)= 0.20501370994989532793115106347627070D+02 + px(45)= 0.24842669385551363138030331359754804D+02 + px(46)= 0.30911787987864913738727989247192145D+02 + pw( 1)= 0.79361463897360195929686273302248409D-03 + pw( 2)= 0.18585856237700307630029880256700761D-02 + pw( 3)= 0.29524996535416808392477309033103772D-02 + pw( 4)= 0.40910988545579399083185141944050702D-02 + pw( 5)= 0.52935540515794390997929859327326122D-02 + pw( 6)= 0.65821742586378490181117941215141045D-02 + pw( 7)= 0.79834063288880762331149927032946792D-02 + pw( 8)= 0.95290513069482303557158918089126201D-02 + pw( 9)= 0.11257625314697109024742951848074159D-01 + pw(10)= 0.13215805246967286000173950239611213D-01 + pw(11)= 0.15459871723048351392169871195100160D-01 + pw(12)= 0.18057057593147761999943506400696980D-01 + pw(13)= 0.21086760440076034016469007977829453D-01 + pw(14)= 0.24641687279129405797304341158994351D-01 + pw(15)= 0.28829130387470289935818007648098355D-01 + pw(16)= 0.33772655230285352209194232463306431D-01 + pw(17)= 0.39614465986840122572670076622475638D-01 + pw(18)= 0.46518617394519992951670269787487787D-01 + pw(19)= 0.54675131186708511535131483491430821D-01 + pw(20)= 0.64305014007496806948186778346285377D-01 + pw(21)= 0.75666179654741774777744798242374952D-01 + pw(22)= 0.89060332032660676125924796117040007D-01 + pw(23)= 0.10484093752103114356647229133814935D+00 + pw(24)= 0.12342248951027967461470307235875341D+00 + pw(25)= 0.14529134199630383752180561542530020D+00 + pw(26)= 0.17101847250539549716018144547422270D+00 + pw(27)= 0.20127464277089058198410341836023683D+00 + pw(28)= 0.23684858001359149093932878615193101D+00 + pw(29)= 0.27866903348006793962100039913921415D+00 + pw(30)= 0.32783191849208153321924464180174769D+00 + pw(31)= 0.38563432289935890460881756854144066D+00 + pw(32)= 0.45361805217372703197887339399899876D+00 + pw(33)= 0.53362686058844350813392769068823926D+00 + pw(34)= 0.62788396771362463542546047416846800D+00 + pw(35)= 0.73910064578742344795091934329070699D+00 + pw(36)= 0.87063402416829911198578365788259413D+00 + pw(37)= 0.10267256741807495552845728772900802D+01 + pw(38)= 0.12128781106626951287021151112542470D+01 + pw(39)= 0.14364778560404412196974965116370614D+01 + pw(40)= 0.17078848068460459936312054922566995D+01 + pw(41)= 0.20424685786283419841913953641602057D+01 + pw(42)= 0.24647555529206493569854377641616308D+01 + pw(43)= 0.30179124308683067944200720815335092D+01 + pw(44)= 0.37893852356394974569134386395867514D+01 + pw(45)= 0.50014251355936200780723745695031871D+01 + pw(46)= 0.75499704094498894150628414754712103D+01 +endif +if(kn == 47) then + px( 1)= 0.30234480684949870042066556848012510D-03 + px( 2)= 0.15976464953770958135599778441796202D-02 + px( 3)= 0.39470145814685269106194100213451950D-02 + px( 4)= 0.73844351214347704596731706115099534D-02 + px( 5)= 0.11960307325670953208852468939751834D-01 + px( 6)= 0.17744140854528926114461559850667081D-01 + px( 7)= 0.24827775421109866384949286855042231D-01 + px( 8)= 0.33329587337597891764863922994014555D-01 + px( 9)= 0.43399840203304717839217570161011672D-01 + px(10)= 0.55227290263964079065971805634237348D-01 + px(11)= 0.69047097585245923081163450963219921D-01 + px(12)= 0.85150014954361349896311981444336255D-01 + px(13)= 0.10389275615680304450176721539326430D+00 + px(14)= 0.12570943253264044220914087598374078D+00 + px(15)= 0.15112403488297752197534393801499475D+00 + px(16)= 0.18076413027130191024242456354592003D+00 + px(17)= 0.21537618776652269918043941270387835D+00 + px(18)= 0.25584316244558279667676325421037561D+00 + px(19)= 0.30320509530959575930710602679134976D+00 + px(20)= 0.35868352704264273864602902528848421D+00 + px(21)= 0.42371052026058185226325968835995444D+00 + px(22)= 0.49996309228509504682760172037312828D+00 + px(23)= 0.58940391513730639900316144249394630D+00 + px(24)= 0.69432925544472178824662119719828653D+00 + px(25)= 0.81742530558526965352748082134155990D+00 + px(26)= 0.96183429918308909086642705501460506D+00 + px(27)= 0.11312321171355587417637105018144043D+01 + px(28)= 0.13299194954977380580964549469885521D+01 + px(29)= 0.15629294821033111247460192707385455D+01 + px(30)= 0.18361545182421012128782235750936341D+01 + px(31)= 0.21564975473013180685304927966921050D+01 + px(32)= 0.25320530389913426066138968271032273D+01 + px(33)= 0.29723260351325192884401956439321237D+01 + px(34)= 0.34885007189662000039152029330111157D+01 + px(35)= 0.40937753452277820469237089959367401D+01 + px(36)= 0.48037889682999426543864264911219419D+01 + px(37)= 0.56371796552402367761125198516216112D+01 + px(38)= 0.66163382295353580459275111274902267D+01 + px(39)= 0.77684648059929853861219213698259439D+01 + px(40)= 0.91271156012149482363243828361549091D+01 + px(41)= 0.10734585093601096152552856517558275D+02 + px(42)= 0.12645801076306643342528896213826369D+02 + px(43)= 0.14935178894323617316649392464504258D+02 + px(44)= 0.17709887244612807742256277065719222D+02 + px(45)= 0.21139166627448714331281414920298006D+02 + px(46)= 0.25533898913637103231788330664255958D+02 + px(47)= 0.31663433742152343555689779493983534D+02 + pw( 1)= 0.77643840612392469558233568878288438D-03 + pw( 2)= 0.18178823313702654997790171434491737D-02 + pw( 3)= 0.28864536542127938374932216849800024D-02 + pw( 4)= 0.39967467153526294501560789189634526D-02 + pw( 5)= 0.51665295391631236792192347882343893D-02 + pw( 6)= 0.64163743838733321438814922251353746D-02 + pw( 7)= 0.77705352297994246418018494260403994D-02 + pw( 8)= 0.92580225788462023682293420241402079D-02 + pw( 9)= 0.10913810605238600296521875414785753D-01 + pw(10)= 0.12780129454852165115348470540659794D-01 + pw(11)= 0.14907770873253776058864543333587366D-01 + pw(12)= 0.17357325452550337474212111231031977D-01 + pw(13)= 0.20200301191038969448754000899427036D-01 + pw(14)= 0.23520156080274576647610305543693418D-01 + pw(15)= 0.27413389239659681777619137570103249D-01 + pw(16)= 0.31990921824324695951826704036400849D-01 + pw(17)= 0.37380010931288310349162200884914999D-01 + pw(18)= 0.43726873583833196750460928410582440D-01 + pw(19)= 0.51200100443073974036149329120700021D-01 + pw(20)= 0.59994868260574886373334055836155630D-01 + pw(21)= 0.70337945020122803930065727247244319D-01 + pw(22)= 0.82493514750755835638795021587104324D-01 + pw(23)= 0.96769906495425836064670059167481399D-01 + pw(24)= 0.11352737495114584133685628457043583D+00 + pw(25)= 0.13318714247714292913023639085868125D+00 + pw(26)= 0.15624197763113426788311768418626933D+00 + pw(27)= 0.18326866459826594224170442595536027D+00 + pw(28)= 0.21494282570862350034472593398023850D+00 + pw(29)= 0.25205671631617380517032575867290262D+00 + pw(30)= 0.29554084830882541265073842592331588D+00 + pw(31)= 0.34649066398539118260401510223685881D+00 + pw(32)= 0.40620005573936491411597998731411544D+00 + pw(33)= 0.47620444337408185641294547642135994D+00 + pw(34)= 0.55833761203594537949491272468091006D+00 + pw(35)= 0.65480899160066358211037008014529884D+00 + pw(36)= 0.76831227871509708311671525382157320D+00 + pw(37)= 0.90218370924525609794334519875989345D+00 + pw(38)= 0.10606417731219318086869090064772246D+01 + pw(39)= 0.12491658485426170682375819102086492D+01 + pw(40)= 0.14751229591934938816623767074938966D+01 + pw(41)= 0.17488634335620615325269065049873221D+01 + pw(42)= 0.20857683710668100441568568919092486D+01 + pw(43)= 0.25104181373755596577440533667319287D+01 + pw(44)= 0.30661234529278624717819692903450825D+01 + pw(45)= 0.38406927297086597567375854792414220D+01 + pw(46)= 0.50574215821071962594952640986635070D+01 + pw(47)= 0.76168408553950413361259182112809766D+01 +endif +if(kn == 48) then + px( 1)= 0.29595002899720089598048509729240858D-03 + px( 2)= 0.15636631385928177658857961799981344D-02 + px( 3)= 0.38621935810659565095543543710661065D-02 + px( 4)= 0.72233636458828631437195788212396685D-02 + px( 5)= 0.11694254587023519966120947669484770D-01 + px( 6)= 0.17339652834546167382576155326885531D-01 + px( 7)= 0.24244927054495278219738066751504960D-01 + px( 8)= 0.32519777490587042844934209629601974D-01 + px( 9)= 0.42303000131579013083618262933841541D-01 + px(10)= 0.53768369083049019518863347968872785D-01 + px(11)= 0.67131693007024900369022557250738329D-01 + px(12)= 0.82659034800333661590098251027713411D-01 + px(13)= 0.10067601794995718748717592252167950D+00 + px(14)= 0.12157811670283109994038391671204829D+00 + px(15)= 0.14584188025331271469532975454293184D+00 + px(16)= 0.17403718708094337911584686617663455D+00 + px(17)= 0.20684083223970121872293135877750441D+00 + px(18)= 0.24505195226053213810882336066380088D+00 + px(19)= 0.28960993053758356990356801037005909D+00 + px(20)= 0.34161548376800915367159074480126869D+00 + px(21)= 0.40235563464585336049157109682672131D+00 + px(22)= 0.47333327369938541669100296412222937D+00 + px(23)= 0.55630204160312520623134123151678925D+00 + px(24)= 0.65330734108265942786541154430622162D+00 + px(25)= 0.76673441875356785236785315155850152D+00 + px(26)= 0.89936464142274902321479563230619771D+00 + px(27)= 0.10544413316663370284059399235987637D+01 + px(28)= 0.12357468356208411453551987100600846D+01 + px(29)= 0.14476928958143963682846712437006035D+01 + px(30)= 0.16954269342764845689137922115223447D+01 + px(31)= 0.19849575816646326604339901913933094D+01 + px(32)= 0.23233038211583159605827592291675721D+01 + px(33)= 0.27186736189401563011050998256085975D+01 + px(34)= 0.31806801589007531007613766645355501D+01 + px(35)= 0.37206072396727045282926190726802162D+01 + px(36)= 0.43517407927213647930655693816847034D+01 + px(37)= 0.50897921727561576322594218577457258D+01 + px(38)= 0.59534532497729963738598299894774892D+01 + px(39)= 0.69651478712196520721782791126144665D+01 + px(40)= 0.81520877402662049285349974267562675D+01 + px(41)= 0.95478213967711421151781315680800985D+01 + px(42)= 0.11194623319536797531467272595329506D+02 + px(43)= 0.13147404152263011059682758275653972D+02 + px(44)= 0.15480595341123512509584325553694590D+02 + px(45)= 0.18301477075836510534702330167968349D+02 + px(46)= 0.21779637522565706043537218918652790D+02 + px(47)= 0.26226933347488070796043946213342574D+02 + px(48)= 0.32415929770269557720899017070958329D+02 + pw( 1)= 0.75999449282242525498888775227070420D-03 + pw( 2)= 0.17789442907920029044278935841370033D-02 + pw( 3)= 0.28233592185581488393945265516117315D-02 + pw( 4)= 0.39067912705549311303324714399927545D-02 + pw( 5)= 0.50457409167121635114256894882489535D-02 + pw( 6)= 0.62592215195201857161143217251824231D-02 + pw( 7)= 0.75695311456923674293727515403732119D-02 + pw( 8)= 0.90032044940945145010068536558863271D-02 + pw( 9)= 0.10592085506839418968886878624905230D-01 + pw(10)= 0.12374483314193827446279695880913988D-01 + pw(11)= 0.14396352965559528046529852650738501D-01 + pw(12)= 0.16712429540159363311802064676696312D-01 + pw(13)= 0.19387262265658075348794679025267961D-01 + pw(14)= 0.22496156590284673178523069618335189D-01 + pw(15)= 0.26126125032895265676414993536188666D-01 + pw(16)= 0.30377032282566849853328475248140570D-01 + pw(17)= 0.35363150724079551172748521328984452D-01 + pw(18)= 0.41215303456842402904128482278539693D-01 + pw(19)= 0.48083691328284411416338444917462566D-01 + pw(20)= 0.56141427938196717060539102662208221D-01 + pw(21)= 0.65588776105316823321919603829156998D-01 + pw(22)= 0.76658094133956830129535839504567249D-01 + pw(23)= 0.89619543419632664042116217247700679D-01 + pw(24)= 0.10478766192717970201342332439078370D+00 + pw(25)= 0.12252896123742271255427206161487662D+00 + pw(26)= 0.14327075828486752574672706211131092D+00 + pw(27)= 0.16751151313821774235835757278858330D+00 + pw(28)= 0.19583302163349966411912965960570620D+00 + pw(29)= 0.22891492049551029451481007469340092D+00 + pw(30)= 0.26755212283797206211340817029488241D+00 + pw(31)= 0.31267604416833112127254011348613261D+00 + pw(32)= 0.36538085205037423661137648346933768D+00 + pw(33)= 0.42695655668634922335366944118783047D+00 + pw(34)= 0.49893168999297620297105696955480497D+00 + pw(35)= 0.58312983017364951950857474996168896D+00 + pw(36)= 0.68174673032193870559875685439839375D+00 + pw(37)= 0.79745906091752647870294820280336694D+00 + pw(38)= 0.93358322488211662322908533451921218D+00 + pw(39)= 0.10943162513760950011430094815688344D+01 + pw(40)= 0.12851165678797307366335354072388698D+01 + pw(41)= 0.15133343910778260993594764587330245D+01 + pw(42)= 0.17893135308834332452949764559455513D+01 + pw(43)= 0.21284497101212601640922819767089646D+01 + pw(44)= 0.25553800629427160180922388763367066D+01 + pw(45)= 0.31135609716998794233456050246781466D+01 + pw(46)= 0.38911620012948672120134104867773046D+01 + pw(47)= 0.51125134158119019180657428081205730D+01 + pw(48)= 0.76826855076385203411408423144047970D+01 +endif +if(kn == 49) then + px( 1)= 0.28982169661609259925974979296199715D-03 + px( 2)= 0.15311074628945562334530690364685912D-02 + px( 3)= 0.37809891427061498988417968394892015D-02 + px( 4)= 0.70693075551075721251308809528074262D-02 + px( 5)= 0.11440114051043380640580809174873546D-01 + px( 6)= 0.16953896205505568651585337372141638D-01 + px( 7)= 0.23690155524198397880316141824091359D-01 + px( 8)= 0.31750757826730201808237900276821105D-01 + px( 9)= 0.41264185607363735111209847472395680D-01 + px(10)= 0.52390788296171869395123759706096667D-01 + px(11)= 0.65329088415812943007951010158851832D-01 + px(12)= 0.80323145971104464698417160245071838D-01 + px(13)= 0.97670923692398180646763841694521778D-01 + px(14)= 0.11773356193272553739176999855177642D+00 + px(15)= 0.14094549927124676527452589226795674D+00 + px(16)= 0.16782548211787615810300942831732404D+00 + px(17)= 0.19898867709139685594599685847717735D+00 + px(18)= 0.23516028351061490409755316887724481D+00 + px(19)= 0.27719118480638011914316315022611666D+00 + px(20)= 0.32607625000818758221137748238031822D+00 + px(21)= 0.38297591208948031388506643958671483D+00 + px(22)= 0.44924164526126338307000295982995931D+00 + px(23)= 0.52644597452520216232192817378809657D+00 + px(24)= 0.61641769981161764547143482902113440D+00 + px(25)= 0.72128311119138160917726410382682506D+00 + px(26)= 0.84351411110478227605310059636301640D+00 + px(27)= 0.98598434482611919288728229535556820D+00 + px(28)= 0.11520346774611530676609320335187635D+01 + px(29)= 0.13455496592189661370306885171963069D+01 + px(30)= 0.15710470166859281003411527674326210D+01 + px(31)= 0.18337827389082834345983893278310805D+01 + px(32)= 0.21398750605157030327208430120547271D+01 + px(33)= 0.24964516861943417767039670584804599D+01 + px(34)= 0.29118261213763337159423664681275174D+01 + px(35)= 0.33957112485642962686719816570055312D+01 + px(36)= 0.39594817726282308325228740791813504D+01 + px(37)= 0.46165026215025079679688539374253711D+01 + px(38)= 0.53825491684099448048962771604007379D+01 + px(39)= 0.62763596401674826758369077814162276D+01 + px(40)= 0.73203847840971806343130509762279760D+01 + px(41)= 0.85418435921307030922239809237588315D+01 + px(42)= 0.99742749209607261981040189301013931D+01 + px(43)= 0.11659933902343682924575082774292738D+02 + px(44)= 0.13653717512175632633212031175648009D+02 + px(45)= 0.16030079451319704520212844485652403D+02 + px(46)= 0.18896419514536338819621757969794054D+02 + px(47)= 0.22422681781072229358731374027395797D+02 + px(48)= 0.26921701181159762662101522272189023D+02 + px(49)= 0.33169237709310133472812163437610434D+02 + pw( 1)= 0.74423705557468321912970974468971512D-03 + pw( 2)= 0.17416585473194369192981311717828790D-02 + pw( 3)= 0.27630202767273380717880972293549723D-02 + pw( 4)= 0.38209254857123832686825908868042634D-02 + pw( 5)= 0.49307262850810818969143173604709699D-02 + pw( 6)= 0.61100327216620297105479543777143453D-02 + pw( 7)= 0.73793929142489472766538930130422414D-02 + pw( 8)= 0.87631380820224889920133086400922484D-02 + pw( 9)= 0.10290337290113942038466358200627605D-01 + pw(10)= 0.11995832288470777205381700382547014D-01 + pw(11)= 0.13921304955008894938778083468431898D-01 + pw(12)= 0.16116315842092009170407309532189491D-01 + pw(13)= 0.18639261057732707055179140040607199D-01 + pw(14)= 0.21558238862705300435120252889679016D-01 + pw(15)= 0.24951892641527327511221421263371520D-01 + pw(16)= 0.28910375260277562929269397616107752D-01 + pw(17)= 0.33536621862509534402237961505810400D-01 + pw(18)= 0.38948101356497020534071631819477111D-01 + pw(19)= 0.45279154574446597766189875771988559D-01 + pw(20)= 0.52683958194920854977905259654150493D-01 + pw(21)= 0.61340113116439272718820743217394536D-01 + pw(22)= 0.71452855489322219691816819653732876D-01 + pw(23)= 0.83259918269799951973839847297358939D-01 + pw(24)= 0.97037114546374637815371486877102608D-01 + pw(25)= 0.11310475960635753107742085931930780D+00 + pw(26)= 0.13183509368952202468340313538628788D+00 + pw(27)= 0.15366091498274177979540995552638069D+00 + pw(28)= 0.17908568983953526314195097583061572D+00 + pw(29)= 0.20869548441677582027963091462587493D+00 + pw(30)= 0.24317317246706239890716235938537564D+00 + pw(31)= 0.28331553768501980083448486564952605D+00 + pw(32)= 0.33005413638688135108429388605654924D+00 + pw(33)= 0.38448116653084696134266394851745611D+00 + pw(34)= 0.44788218293156059076871251650381853D+00 + pw(35)= 0.52177844169743715495966266074704591D+00 + pw(36)= 0.60798318304777180959943540325649013D+00 + pw(37)= 0.70867868475109945308012661652305988D+00 + pw(38)= 0.82652519802949145510247997374762292D+00 + pw(39)= 0.96482036576439726113478189858839344D+00 + pw(40)= 0.11277413304728328935012105843530012D+01 + pw(41)= 0.13207276384535346696169033036111123D+01 + pw(42)= 0.15511151994361137315814825047961978D+01 + pw(43)= 0.18292440906220512275179521250816934D+01 + pw(44)= 0.21705274773423284009902208179302011D+01 + pw(45)= 0.25996616983900981131162894691162348D+01 + pw(46)= 0.31602501106700710441030906881720498D+01 + pw(47)= 0.39408219589198821554199569326106068D+01 + pw(48)= 0.51667323905056736314902652284285907D+01 + pw(49)= 0.77475392524477876553320371372226975D+01 +endif +if(kn == 50) then + px( 1)= 0.28394351112953297748994559783102511D-03 + px( 2)= 0.14998911954688725774856401484033629D-02 + px( 3)= 0.37031733634109224832474380329849766D-02 + px( 4)= 0.69218127312350228406485064567062889D-02 + px( 5)= 0.11197087943784824700478952929697149D-01 + px( 6)= 0.16585564985245590131106268275448670D-01 + px( 7)= 0.23161412500517665834595664639261402D-01 + px( 8)= 0.31019403043677038055911010779170235D-01 + px( 9)= 0.40278717647655643037391307728658037D-01 + px(10)= 0.51087642190675989042234002017798487D-01 + px(11)= 0.63629213809160599368347959049041399D-01 + px(12)= 0.78127829475675329032296849094039453D-01 + px(13)= 0.94856775739499734452627500076424815D-01 + px(14)= 0.11414660148335876745918230546444594D+00 + px(15)= 0.13639426440178507981483041317230735D+00 + px(16)= 0.16207305783142124363194134470205347D+00 + px(17)= 0.19174346191330080395615598185222217D+00 + px(18)= 0.22606522557261662878988725676839652D+00 + px(19)= 0.26581112483702385355663297234020894D+00 + px(20)= 0.31188292620679003206265405632135533D+00 + px(21)= 0.36533011131301634013423764015010912D+00 + px(22)= 0.42737191711071555589396273204369958D+00 + px(23)= 0.49942324705020639374541286536052638D+00 + px(24)= 0.58312503667951418472684968751768941D+00 + px(25)= 0.68037972242344487980438298325566643D+00 + px(26)= 0.79339256658260940896953648094143840D+00 + px(27)= 0.92471973446804368904394005417532194D+00 + px(28)= 0.10773242035057135225415445929049064D+01 + px(29)= 0.12546408175407001854792111537007726D+01 + px(30)= 0.14606520990918797361653158063467201D+01 + px(31)= 0.16999768257560608388590285583482666D+01 + px(32)= 0.19779739082773843717702150700803880D+01 + px(33)= 0.23008648456607864889580088643288349D+01 + px(34)= 0.26758790850855905290663568936243531D+01 + px(35)= 0.31114281531421848676256035865658428D+01 + px(36)= 0.36173167280945029265548597355573382D+01 + px(37)= 0.42050023489136640359196188798116726D+01 + px(38)= 0.48879209793922610146974780441291671D+01 + px(39)= 0.56819045073984933891246336517741761D+01 + px(40)= 0.66057308705566818707545720531514149D+01 + px(41)= 0.76818723662188017987959388336177329D+01 + px(42)= 0.89375516656298253147474053069187414D+01 + px(43)= 0.10406296477480503239729452647809424D+02 + px(44)= 0.12130343567869783578305872825995901D+02 + px(45)= 0.14164579787764828196349642136898031D+02 + px(46)= 0.16583487005376427804066458589286026D+02 + px(47)= 0.19494592188944424125485472332517264D+02 + px(48)= 0.23068202782732093092557399540639023D+02 + px(49)= 0.27618134557703623179690912010041751D+02 + px(50)= 0.33923321036009768333728618541703920D+02 + pw( 1)= 0.72912399684308635017096115410335056D-03 + pw( 2)= 0.17059216228519449314704147393870548D-02 + pw( 3)= 0.27052579385463033654518158126252911D-02 + pw( 4)= 0.37388707495255014610097719618737042D-02 + pw( 5)= 0.48210691945239681765906911252349642D-02 + pw( 6)= 0.59681965658636724574936823069437344D-02 + pw( 7)= 0.71992306962236770905536136519431856D-02 + pw( 8)= 0.85365357689533034566484478187242343D-02 + pw( 9)= 0.10006713677868850439161417859632759D-01 + pw(10)= 0.11641531351319812896113767691847514D-01 + pw(11)= 0.13478885680498147705267761047018866D-01 + pw(12)= 0.15563753671054168683182913698539199D-01 + pw(13)= 0.17949077884564351780048811341841315D-01 + pw(14)= 0.20696568103487745734831198958981091D-01 + pw(15)= 0.23877459843864173187990416648243479D-01 + pw(16)= 0.27573340091582106763900700105376174D-01 + pw(17)= 0.31877198204074685512995246247087960D-01 + pw(18)= 0.36894860399462428730109186000241340D-01 + pw(19)= 0.42746921785947037355880124311792745D-01 + pw(20)= 0.49571228532988571977201451960147272D-01 + pw(21)= 0.57525917359503163899312028414966321D-01 + pw(22)= 0.66793006913272852902897497204087823D-01 + pw(23)= 0.77582552855288975335680322806095329D-01 + pw(24)= 0.90137412508095941124740779645748160D-01 + pw(25)= 0.10473870396294021438411474581832967D+00 + pw(26)= 0.12171208319009365070220694175921390D+00 + pw(27)= 0.14143500171709279543618029697282644D+00 + pw(28)= 0.16434515151851001863886624876712371D+00 + pw(29)= 0.19095035999365251682074028234920175D+00 + pw(30)= 0.22184027575178191861381490980142531D+00 + pw(31)= 0.25770029856713080526288855613179455D+00 + pw(32)= 0.29932837398793730342885176518781178D+00 + pw(33)= 0.34765552544247195237609202772613096D+00 + pw(34)= 0.40377138372726537143521552364635440D+00 + pw(35)= 0.46895657668751681273934682099459395D+00 + pw(36)= 0.54472479692767961940426388856919793D+00 + pw(37)= 0.63287890678130081055700714208328112D+00 + pw(38)= 0.73558798250487807944749198073740843D+00 + pw(39)= 0.85549650522124124655227075039375323D+00 + pw(40)= 0.99588443073362582872294183483256617D+00 + pw(41)= 0.11609105355219986209989146786182259D+01 + pw(42)= 0.13559974384949635725484334023159422D+01 + pw(43)= 0.15884690610914334611993355370110858D+01 + pw(44)= 0.18686642910659719608475187905997900D+01 + pw(45)= 0.22120162660997500537540215058314098D+01 + pw(46)= 0.26432826564193229543032457728289448D+01 + pw(47)= 0.32062148179068205963629824250927723D+01 + pw(48)= 0.39897000025706165355011208836568943D+01 + pw(49)= 0.52201085267504392582124308457577559D+01 + pw(50)= 0.78114350904888610285435916161990246D+01 +endif +if(kn == 51) then + px( 1)= 0.27830048331694745146626670725003848D-03 + px( 2)= 0.14699332467589495012214296420135614D-02 + px( 3)= 0.36285373076700958382889506232860957D-02 + px( 4)= 0.67804640421568805932750563900953938D-02 + px( 5)= 0.10964449641475153555650317586921338D-01 + px( 6)= 0.16233474841656748209131399424045480D-01 + px( 7)= 0.22656849606742837535759660703189982D-01 + px( 8)= 0.30322907994226545082257492807954794D-01 + px( 9)= 0.39342419929775285730416663052397862D-01 + px(10)= 0.49852801135275356633188650023787799D-01 + px(11)= 0.62023178388698049878991488778075058D-01 + px(12)= 0.76060329272036397821127225875497380D-01 + px(13)= 0.92215469045402426815676149451909456D-01 + px(14)= 0.11079181963288772179472869891025245D+00 + px(15)= 0.13215289192145648970655359123013550D+00 + px(16)= 0.15673146375949390138182262372756365D+00 + px(17)= 0.18503934411172685168403222212810987D+00 + px(18)= 0.21767815425667758980667415659377157D+00 + px(19)= 0.25535148902919646272821292167480645D+00 + px(20)= 0.29887891101338956958304222194317216D+00 + px(21)= 0.34921226930606420176060402127989888D+00 + px(22)= 0.40745483836472093899131085922052085D+00 + px(23)= 0.47488376920818361679977841816170633D+00 + px(24)= 0.55297635865326739663234570169538373D+00 + px(25)= 0.64344068535303042620990947995897496D+00 + px(26)= 0.74825123790612149826635700308884120D+00 + px(27)= 0.86969026995696913003053787424248180D+00 + px(28)= 0.10103957605313364799613553988071921D+01 + px(29)= 0.11734170393027801618266450011528511D+01 + px(30)= 0.13622793663980858168640978903164547D+01 + px(31)= 0.15810590528845442660473438585783950D+01 + px(32)= 0.18344711002533369402909515294518176D+01 + px(33)= 0.21279718700902070387593931590109607D+01 + px(34)= 0.24678800386227544163678584032999521D+01 + px(35)= 0.28615201544104645116093866929518600D+01 + px(36)= 0.33173946748043709269495501699195293D+01 + px(37)= 0.38453926882082167185869634333011899D+01 + px(38)= 0.44570470958112522799784043488271277D+01 + px(39)= 0.51658576054751924442339769813379872D+01 + px(40)= 0.59877058293844537789530450556075720D+01 + px(41)= 0.69414034941300311714983901016833412D+01 + px(42)= 0.80494397879808445218602940428136405D+01 + px(43)= 0.93390380570666107887290025527994856D+01 + px(44)= 0.10843713859839632308899371210210092D+02 + px(45)= 0.12605686843057786008572981971452711D+02 + px(46)= 0.14679837296994134584444426535907765D+02 + px(47)= 0.17140680920657630892046419784075754D+02 + px(48)= 0.20095878907672599896988947474029035D+02 + px(49)= 0.23716108785958653467610019526488283D+02 + px(50)= 0.28316168939449414612178766745269557D+02 + px(51)= 0.34678144812502309409979036489309933D+02 + pw( 1)= 0.71461661125073618577714141168068574D-03 + pw( 2)= 0.16716385878603382275615092392545966D-02 + pw( 3)= 0.26499087137809206936489894716951881D-02 + pw( 4)= 0.36603737280291944817450300201489801D-02 + pw( 5)= 0.47163932617577667464231453856574222D-02 + pw( 6)= 0.58331641163766286245746932185477944D-02 + pw( 7)= 0.70282510906775725331369422805168084D-02 + pw( 8)= 0.83222573157223186581625603590243717D-02 + pw( 9)= 0.97395848045625290059070106665033066D-02 + pw(10)= 0.11309266338983814689829245251610443D-01 + pw(11)= 0.13065837815189701273276681449362080D-01 + pw(12)= 0.15050206824622500427636027066505286D-01 + pw(13)= 0.17310471604431563174843365788224820D-01 + pw(14)= 0.19902665286202140668402395991957144D-01 + pw(15)= 0.22891446750016473901306940204721219D-01 + pw(16)= 0.26350819616702910168573443501795867D-01 + pw(17)= 0.30365009714194435950502154476602598D-01 + pw(18)= 0.35029644330122852911457703936925401D-01 + pw(19)= 0.40453348181743022646044235704229779D-01 + pw(20)= 0.46759819486523129479791042090399526D-01 + pw(21)= 0.54090403179921190709407057449782309D-01 + pw(22)= 0.62607156912964304588823654347725686D-01 + pw(23)= 0.72496411785130069944054386511386143D-01 + pw(24)= 0.83972854815470973907373202544753383D-01 + pw(25)= 0.97284192841846608857881098157165865D-01 + pw(26)= 0.11271649103721755058596361082284142D+00 + pw(27)= 0.13060031219247571052268358514865611D+00 + pw(28)= 0.15131781797811823631871970343701060D+00 + pw(29)= 0.17531103556726408969705872673699634D+00 + pw(30)= 0.20309154904134215752202092778295388D+00 + pw(31)= 0.23525195397891395078569762059044835D+00 + pw(32)= 0.27247952854009146289412622775860388D+00 + pw(33)= 0.31557274490479512420214647476118810D+00 + pw(34)= 0.36546150205090068791751615370418030D+00 + pw(35)= 0.42323235431744446559543345018447882D+00 + pw(36)= 0.49016062156217930068156275755723687D+00 + pw(37)= 0.56775223265564612092706940030068368D+00 + pw(38)= 0.65779970939872780221136201150769308D+00 + pw(39)= 0.76245925865498574135551621597788489D+00 + pw(40)= 0.88436027001142532199927186614862192D+00 + pw(41)= 0.10267660753266581074365019471728486D+01 + pw(42)= 0.11938185471963720076518527335025432D+01 + pw(43)= 0.13909252171844327541246452342529830D+01 + pw(44)= 0.16254001662180292675140925291992103D+01 + pw(45)= 0.19075834129441296076092154965785291D+01 + pw(46)= 0.22529303361525593567401491175421270D+01 + pw(47)= 0.26862617892200971765670039600369766D+01 + pw(48)= 0.32514778994133779406256785815727141D+01 + pw(49)= 0.40378220858095016239167419409137174D+01 + pw(50)= 0.52726701882528895779137444037618021D+01 + pw(51)= 0.78744042074131644922750515876542347D+01 +endif +if(kn == 52) then + px( 1)= 0.27287878963778979318704099764976979D-03 + px( 2)= 0.14411589143135589309910830756029883D-02 + px( 3)= 0.35568888680936775757132750276695479D-02 + px( 4)= 0.66448808369926751816655531142947976D-02 + px( 5)= 0.10741535203421646235306609286779033D-01 + px( 6)= 0.15896548124765745985567116234974430D-01 + px( 7)= 0.22174792928607557748215031029525379D-01 + px( 8)= 0.29658745389012720604963104746455902D-01 + px( 9)= 0.38451550078535438960644609304592703D-01 + px(10)= 0.48680802154773981897502480211717432D-01 + px(11)= 0.60503099784552369897160088385145451D-01 + px(12)= 0.74109391214600302911242894598461603D-01 + px(13)= 0.89731100249233358933686871661989877D-01 + px(14)= 0.10764697753584156876969725074282997D+00 + px(15)= 0.12819061301398605430851761152295284D+00 + px(16)= 0.15175857698395808054368327545045674D+00 + px(17)= 0.17881924018609623916197948354619579D+00 + px(18)= 0.20992244196465392763256291141326856D+00 + px(19)= 0.24571029757551810503161176844891960D+00 + px(20)= 0.28692952833595684927481466342699473D+00 + px(21)= 0.33444574647578471028054731992057917D+00 + px(22)= 0.38926013793139080772351691924522347D+00 + px(23)= 0.45252898276941636881394275842894023D+00 + px(24)= 0.52558645699160000314972991962565110D+00 + px(25)= 0.60997118585865879765197048254341330D+00 + px(26)= 0.70745707342364955349644867464416164D+00 + px(27)= 0.82008901627275511415852311384357046D+00 + px(28)= 0.95022422122473475017786104441461093D+00 + px(29)= 0.11005799888944176702824780847485411D+01 + px(30)= 0.12742890037078181125625630187828027D+01 + px(31)= 0.14749633979156047676958663456164965D+01 + px(32)= 0.17067691516542664863528589999065341D+01 + px(33)= 0.19745127831074488312044849464771686D+01 + px(34)= 0.22837428183447160072986023000468876D+01 + px(35)= 0.26408692804764055817543358579323038D+01 + px(36)= 0.30533055131576702465080896333143573D+01 + px(37)= 0.35296382295469342100204889068765045D+01 + px(38)= 0.40798340368281393790208425276818839D+01 + px(39)= 0.47154942919829600315975357986809138D+01 + px(40)= 0.54501757769677042873771351203731296D+01 + px(41)= 0.62998036929575447450649046764701569D+01 + px(42)= 0.72832182898545812637798931719272845D+01 + px(43)= 0.84229216047310103819935078476256484D+01 + px(44)= 0.97461351851146406033204591955381092D+01 + px(45)= 0.11286361832018459964955857919026558D+02 + px(46)= 0.13085805530780086242892657357654051D+02 + px(47)= 0.15199343565107891341827069072222156D+02 + px(48)= 0.17701530875675334700619075035054618D+02 + px(49)= 0.20700169445006682515691632852528091D+02 + px(50)= 0.24366312928914848142376439425799257D+02 + px(51)= 0.29015743374923417716784066471530138D+02 + px(52)= 0.35433676271462821390664880501380081D+02 + pw( 1)= 0.70067921248512656450750262380470989D-03 + pw( 2)= 0.16387221028734144344416061643064582D-02 + pw( 3)= 0.25968227302892772688118502909161922D-02 + pw( 4)= 0.35852033221618217540588075672036645D-02 + pw( 5)= 0.46163571334057459414651619546435755D-02 + pw( 6)= 0.57044407177542779516540853577376776D-02 + pw( 7)= 0.68657439258466701284209801337979038D-02 + pw( 8)= 0.81192888642852760066695573485214022D-02 + pw( 9)= 0.94875105621027290016976460295608743D-02 + pw(10)= 0.10997004127430955685924352255220372D-01 + pw(11)= 0.12679313613778005716132640549939369D-01 + pw(12)= 0.14571725519850453422967526162847044D-01 + pw(13)= 0.16718025683719221072849622604121732D-01 + pw(14)= 0.19169191632602931509237238888017571D-01 + pw(15)= 0.21984027773632325146538486813698425D-01 + pw(16)= 0.25229801526425832500489421276402268D-01 + pw(17)= 0.28982985579859711073310312330962579D-01 + pw(18)= 0.33330232727989722055601631534296414D-01 + pw(19)= 0.38369695026986305327273292677722862D-01 + pw(20)= 0.44212758195453754037273486501473222D-01 + pw(21)= 0.50986218113528798819802588773272107D-01 + pw(22)= 0.58834899219859082186151202608595314D-01 + pw(23)= 0.67924711805560266218170048520749527D-01 + pw(24)= 0.78446161793212774417972991998090425D-01 + pw(25)= 0.90618353089338093281832862102708959D-01 + pw(26)= 0.10469355154807580861182066828267783D+00 + pw(27)= 0.12096240801949377693281668581524884D+00 + pw(28)= 0.13975996681192581185584791335037507D+00 + pw(29)= 0.16147261854647111561641477797041919D+00 + pw(30)= 0.18654619771836799217214159212739546D+00 + pw(31)= 0.21549548172910307786548214775429476D+00 + pw(32)= 0.24891542853693721010435838752243091D+00 + pw(33)= 0.28749460731095023155833868404490821D+00 + pw(34)= 0.33203145035164386811290281590906319D+00 + pw(35)= 0.38345421620808062938143753495150014D+00 + pw(36)= 0.44284595340052667020006768746408026D+00 + pw(37)= 0.51147637334768443075077758376770258D+00 + pw(38)= 0.59084351705391666304914198628651331D+00 + pw(39)= 0.68272966822652633355509618616923460D+00 + pw(40)= 0.78927854268555272214405511831212374D+00 + pw(41)= 0.91310513570381533776392939171880649D+00 + pw(42)= 0.10574572006171471782817226512789618D+01 + pw(43)= 0.12264611065174215544931924431769419D+01 + pw(44)= 0.14255110268002246494143307711209540D+01 + pw(45)= 0.16619131890762190850734830449377183D+01 + pw(46)= 0.19460108584649183568553387986228161D+01 + pw(47)= 0.22932836878593317486878267058034769D+01 + pw(48)= 0.27286173208169332405445956554845995D+01 + pw(49)= 0.32960612083291505532244371231002347D+01 + pw(50)= 0.40852129572893008083223241182607456D+01 + pw(51)= 0.53244443718708726283429813785502501D+01 + pw(52)= 0.79364763274033460235909669710486787D+01 +endif +if(kn == 53) then + px( 1)= 0.26766570337770727528170071246520492D-03 + px( 2)= 0.14134994888717316196512141359660423D-02 + px( 3)= 0.34880516519107403970758408932728686D-02 + px( 4)= 0.65147144546990128201893001657184119D-02 + px( 5)= 0.10527738385818098548917273507385571D-01 + px( 6)= 0.15573804518451021912970981414950172D-01 + px( 7)= 0.21713726252830234938777297490158659D-01 + px( 8)= 0.29024636763636674603190500526652300D-01 + px( 9)= 0.37602750842526664184644761079770188D-01 + px(10)= 0.47566769031392334409003814047662918D-01 + px(11)= 0.59061976687480610814082596844534784D-01 + px(12)= 0.72265065129161553717681820608077034D-01 + px(13)= 0.87389667514901234333023139285043850D-01 + px(14)= 0.10469256811716735138855696367380459D+00 + px(15)= 0.12448052659100388617653071564391793D+00 + px(16)= 0.14711767656647036665309334227739246D+00 + px(17)= 0.17303351979129595018460716414084922D+00 + px(18)= 0.20273163525968242510069482779745799D+00 + px(19)= 0.23679933260205764383581082765297944D+00 + px(20)= 0.27591857089423386003627193488624766D+00 + px(21)= 0.32087851933017891678519245554349762D+00 + px(22)= 0.37259015544714682612264560035248998D+00 + px(23)= 0.43210329558213462840685308411772863D+00 + px(24)= 0.50062645115169427006331566594320087D+00 + px(25)= 0.57954991863929172590111454477122563D+00 + px(26)= 0.67047254853929904297053555520671948D+00 + px(27)= 0.77523270078396767247020251586833006D+00 + px(28)= 0.89594398099521297070364552635590329D+00 + px(29)= 0.10350364637633983911134434941276054D+01 + px(30)= 0.11953042494029295972874450876803876D+01 + px(31)= 0.13799603766985451495079757379930050D+01 + px(32)= 0.15927003388200029769046949954648636D+01 + px(33)= 0.18377757429440148519213952318668567D+01 + px(34)= 0.21200800469036983170430724963400520D+01 + px(35)= 0.24452488452376621255574183769872919D+01 + px(36)= 0.28197779343346462148849901776278631D+01 + px(37)= 0.32511634739679835132078134711319616D+01 + px(38)= 0.37480701572008120067601493824866792D+01 + px(39)= 0.43205356864545827578744429205984365D+01 + px(40)= 0.49802234963192223418120333779580923D+01 + px(41)= 0.57407413466949074338019856932754499D+01 + px(42)= 0.66180524896462577279982649502155498D+01 + px(43)= 0.76310210235736203134564527928377033D+01 + px(44)= 0.88021583399572758584848067436385530D+01 + px(45)= 0.10158682176971824690390966321630474D+02 + px(46)= 0.11734082305033337839638441179290729D+02 + px(47)= 0.13570548670500422261838718291627263D+02 + px(48)= 0.15722959067717953266995872699981198D+02 + px(49)= 0.18265912835593835999181449267925846D+02 + px(50)= 0.21307358846087134932506642776378880D+02 + px(51)= 0.25018732302848078195180809407087663D+02 + px(52)= 0.29716799310767845023829953456809187D+02 + px(53)= 0.36189883296712405656605145166567620D+02 + pw( 1)= 0.68727895311546056672417632556060663D-03 + pw( 2)= 0.16070919274300689247610198780862261D-02 + pw( 3)= 0.25458627429238886643700982329323737D-02 + pw( 4)= 0.35131488282427671750845547282299960D-02 + pw( 5)= 0.45206512132016346669133873690489887D-02 + pw( 6)= 0.55815803493477684696883612785461534D-02 + pw( 7)= 0.67110727810055987477008256080221629D-02 + pw( 8)= 0.79267274337364857488746447630284210D-02 + pw( 9)= 0.92492158987355566092832344937099689D-02 + pw(10)= 0.10702954335651349887056984538693118D-01 + pw(11)= 0.12316817159979657159477748940914048D-01 + pw(12)= 0.14124861608422353998695630535116428D-01 + pw(13)= 0.16167026688242225669124227980203198D-01 + pw(14)= 0.18489777850309322975434696546792003D-01 + pw(15)= 0.21146695079329887928180449259296579D-01 + pw(16)= 0.24199043898015304232347701717858443D-01 + pw(17)= 0.27716412274949780256699383093693372D-01 + pw(18)= 0.31777522531875989191876025347365721D-01 + pw(19)= 0.36471323604726302437811675512366006D-01 + pw(20)= 0.41898438814168148316223242028468085D-01 + pw(21)= 0.48173004736710585958446082731843080D-01 + pw(22)= 0.55424906910282799399495351801946886D-01 + pw(23)= 0.63802408146079784845031445360740918D-01 + pw(24)= 0.73475174123291445512755683573657914D-01 + pw(25)= 0.84637721415891914195614185274215305D-01 + pw(26)= 0.97513337772431529356043555523731762D-01 + pw(27)= 0.11235854926743224508497153739859653D+00 + pw(28)= 0.12946823338997306958433565807904794D+00 + pw(29)= 0.14918150326264123120255847449750618D+00 + pw(30)= 0.17188851951158054500285261747356087D+00 + pw(31)= 0.19803842749441616169319688491554627D+00 + pw(32)= 0.22814867480320173542334621460640035D+00 + pw(33)= 0.26281604587933027973782593098613157D+00 + pw(34)= 0.30272987012943215567370818850475517D+00 + pw(35)= 0.34868803701893003485156344297085159D+00 + pw(36)= 0.40161671758475143724478782201274920D+00 + pw(37)= 0.46259509691660897469854774936577795D+00 + pw(38)= 0.53288704858998218510998379047232847D+00 + pw(39)= 0.61398266720080789199956346065784397D+00 + pw(40)= 0.70765415497287477021582999321525042D+00 + pw(41)= 0.81603315079286754570846341816085723D+00 + pw(42)= 0.94172096259934976721215747985393842D+00 + pw(43)= 0.10879507869255648374416119995114567D+01 + pw(44)= 0.12588348263164235013060998002290594D+01 + pw(45)= 0.14597555182997943253538598754860232D+01 + pw(46)= 0.16980130743144824162948847512548702D+01 + pw(47)= 0.19839559340047588104729344306844787D+01 + pw(48)= 0.23330898446405588037915551380406672D+01 + pw(49)= 0.27703666290115585782188239169095417D+01 + pw(50)= 0.33399854209196729287026099030910255D+01 + pw(51)= 0.41318959182038353492816091595673578D+01 + pw(52)= 0.53754564214506164841297568531690314D+01 + pw(53)= 0.79976793171584998097855897868836215D+01 +endif +if(kn == 54) then + px( 1)= 0.26264930117348429139290996151281804D-03 + px( 2)= 0.13868906738912660879236605780496720D-02 + px( 3)= 0.34218609389365101881877496771598208D-02 + px( 4)= 0.63896402666404078488978957960735342D-02 + px( 5)= 0.10322496845387460828947292469844905D-01 + px( 6)= 0.15264338738352449380895430335814458D-01 + px( 7)= 0.21272256455056810596723516130341971D-01 + px( 8)= 0.28418500058997028295178690126035519D-01 + px( 9)= 0.36792971921502915811121828397263107D-01 + px(10)= 0.46506297013822420204383294802715589D-01 + px(11)= 0.57693520466708188814704660142883650D-01 + px(12)= 0.70518461365129452425223403139712676D-01 + px(13)= 0.85178722402024364366080809274553805D-01 + px(14)= 0.10191132396313916367597436543864246D+00 + px(15)= 0.12099891148146303511007328280200670D+00 + px(16)= 0.14277649208960370764126858545635037D+00 + px(17)= 0.16763870128060588007507633768412165D+00 + px(18)= 0.19604767989373694212165892917330544D+00 + px(19)= 0.22854173844903711798992185084177464D+00 + px(20)= 0.26574507414867220998786541508194950D+00 + px(21)= 0.30837886575038189865518112878881341D+00 + px(22)= 0.35727409810156122358477142059464743D+00 + px(23)= 0.41338647141242328011007195602712556D+00 + px(24)= 0.47781374735909608618161123128437916D+00 + px(25)= 0.55181589013188334829869817262449587D+00 + px(26)= 0.63683838468354892203676383796234712D+00 + px(27)= 0.73453915990617498625433330235628987D+00 + px(28)= 0.84681961134902351472977037719948889D+00 + px(29)= 0.97586030619425103922072317930864173D+00 + px(30)= 0.11241620640819289778534857523108689D+01 + px(31)= 0.12945932455227004001780815602771556D+01 + px(32)= 0.14904442534576446040956072788546865D+01 + px(33)= 0.17154904765677712078499252639933687D+01 + px(34)= 0.19740651959466252026766816200100743D+01 + px(35)= 0.22711443710983615908928413364407126D+01 + px(36)= 0.26124457644974208978288838251819154D+01 + px(37)= 0.30045456290378285221329470833486270D+01 + px(38)= 0.34550172827009750892780305838794194D+01 + px(39)= 0.39725975078115629937328151923773107D+01 + px(40)= 0.45673891239659119753724152909459628D+01 + px(41)= 0.52511117619579365605498903719507942D+01 + px(42)= 0.60374185967841005869825918871706947D+01 + px(43)= 0.69423059418930191122711841795147301D+01 + px(44)= 0.79846576055279832356713682899398131D+01 + px(45)= 0.91869913294368492995853064476380612D+01 + px(46)= 0.10576519438659101773107722114056823D+02 + px(47)= 0.12186718685254033866362337537527451D+02 + px(48)= 0.14059766727898227283287996455084238D+02 + px(49)= 0.16250545334714196738451761057164415D+02 + px(50)= 0.18833703150856317701270861469808901D+02 + px(51)= 0.21917341606064316865260155485126217D+02 + px(52)= 0.25673282302478359060862571209297198D+02 + px(53)= 0.30419275213506007788817412536110986D+02 + px(54)= 0.36946729449797287732704130194371723D+02 + pw( 1)= 0.67438506760335604670255898997892204D-03 + pw( 2)= 0.15766730795758068292367252096595575D-02 + pw( 3)= 0.24969010052497357618191188751223522D-02 + pw( 4)= 0.34440151585437669876044394575278031D-02 + pw( 5)= 0.44289906416819842278218395665719176D-02 + pw( 6)= 0.54641754472827296169646638159218540D-02 + pw( 7)= 0.65636602656678083773300472655876417D-02 + pw( 8)= 0.77437596216345299454674361542457447D-02 + pw( 9)= 0.90235600454359303552602178356724465D-02 + pw(10)= 0.10425525098028000202217706397525802D-01 + pw(11)= 0.11976141415278903324803398081755125D-01 + pw(12)= 0.13706580129463991816022845593243222D-01 + pw(13)= 0.15653341765432086164765180782288484D-01 + pw(14)= 0.17858856637993859978963097726835278D-01 + pw(15)= 0.20372031993532375434976430093094102D-01 + pw(16)= 0.23248770947110602095264163700609205D-01 + pw(17)= 0.26552527146765384924391807859062013D-01 + pw(18)= 0.30354987338193146683949319082938468D-01 + pw(19)= 0.34736978572221172926784909895879575D-01 + pw(20)= 0.39789676392153673565656352708108709D-01 + pw(21)= 0.45616156632160377923066059864692116D-01 + pw(22)= 0.52333303181858472447283586460111768D-01 + pw(23)= 0.60074068936523280823710913114576385D-01 + pw(24)= 0.68990089360969502777474092821797731D-01 + pw(25)= 0.79254662793149992222714102849062817D-01 + pw(26)= 0.91066132125499050058268572358948574D-01 + pw(27)= 0.10465172413589772775444532528313942D+00 + pw(28)= 0.12027192389960804141840562034167157D+00 + pw(29)= 0.13822548329913942877038198233392259D+00 + pw(30)= 0.15885518710020893698686679081438881D+00 + pw(31)= 0.18255453080015552034691247423767820D+00 + pw(32)= 0.20977550590982571093431208426850228D+00 + pw(33)= 0.24103774649612254567565390373246475D+00 + pw(34)= 0.27693937431815337409878200394680401D+00 + pw(35)= 0.31817000170668241587144774061721975D+00 + pw(36)= 0.36552653149467718606906358503261169D+00 + pw(37)= 0.41993266332639492829188554992634159D+00 + pw(38)= 0.48246342603227571148223509952223773D+00 + pw(39)= 0.55437668888757955166034857478955110D+00 + pw(40)= 0.63715459820483206603034925190427611D+00 + pw(41)= 0.73255947628580611457525194256778522D+00 + pw(42)= 0.84271132620425470830602701627255624D+00 + pw(43)= 0.97019847802255398271823872088809738D+00 + pw(44)= 0.11182405648635278467279403261340421D+01 + pw(45)= 0.12909368955309583178279432436240182D+01 + pw(46)= 0.14936596913948188663729129981816013D+01 + pw(47)= 0.17337048448489904526596452227676677D+01 + pw(48)= 0.20214277264670434357889736680445585D+01 + pw(49)= 0.23723618064324051170579931680799604D+01 + pw(50)= 0.28115262822357354520836275890201874D+01 + pw(51)= 0.33832701615264074051804266553052254D+01 + pw(52)= 0.41778930411553237976334045242367218D+01 + pw(53)= 0.54257303561036728308093456793849096D+01 + pw(54)= 0.80580396891998440461864612470334815D+01 +endif +if(kn == 55) then + px( 1)= 0.25781894164194012434309377930977151D-03 + px( 2)= 0.13612751031594850206775668181403743D-02 + px( 3)= 0.33581698458852455723214904600493989D-02 + px( 4)= 0.62693690413467873882583513720977616D-02 + px( 5)= 0.10125310136665687971355854015542415D-01 + px( 6)= 0.14967346351298576942771412456996075D-01 + px( 7)= 0.20849147888131304033053596047200679D-01 + px( 8)= 0.27838492572990899694403556585128016D-01 + px( 9)= 0.36019520279673976527480271121267550D-01 + px(10)= 0.45495507499439646885138144349015201D-01 + px(11)= 0.56392208694907062019327988961512485D-01 + px(12)= 0.68861794220674272839206633722494427D-01 + px(13)= 0.83087389784370585294971502938757785D-01 + px(14)= 0.99288194535557030324060332733237417D-01 + px(15)= 0.11772513411433556959001042015562167D+00 + px(16)= 0.13870700468575135671370372276360989D+00 + px(17)= 0.16259709487800222734179448321576442D+00 + px(18)= 0.18982033572772041366926851284912359D+00 + px(19)= 0.22087111215348614429940107012361562D+00 + px(20)= 0.25632195206452177680013231303976781D+00 + px(21)= 0.29683337125272468206259039405023979D+00 + px(22)= 0.34316518501958180617486163242847902D+00 + px(23)= 0.39618960592837583486273888758703435D+00 + px(24)= 0.45690644457913117767731277056887464D+00 + px(25)= 0.52646073119585672648955063711186170D+00 + px(26)= 0.60616309005755762712921556869139875D+00 + px(27)= 0.69751323092341496562260118366147217D+00 + px(28)= 0.80222697243582393516449526286850739D+00 + px(29)= 0.92226728168578278287496843309443837D+00 + px(30)= 0.10598799021605760545446954338407357D+01 + px(31)= 0.12176342516911872309482334694729854D+01 + px(32)= 0.13984704081792695653834769855727069D+01 + px(33)= 0.16057531729310064020228982054108760D+01 + px(34)= 0.18433344234846220454903450952844757D+01 + px(35)= 0.21156252611660201772094982169604363D+01 + px(36)= 0.24276798551755632926179359953601148D+01 + px(37)= 0.27852934330452205314765679671405731D+01 + px(38)= 0.31951176405057028717258322130425570D+01 + px(39)= 0.36647976062397870566073557102018905D+01 + px(40)= 0.42031366779907243753664331178612278D+01 + px(41)= 0.48202972331234718143106561639280279D+01 + px(42)= 0.55280496786915488679928146699422870D+01 + px(43)= 0.63400875319084531166198158934082316D+01 + px(44)= 0.72724356766190935880980645516629891D+01 + px(45)= 0.83439939737765991151533907320407321D+01 + px(46)= 0.95772839454414406682247983833388953D+01 + px(47)= 0.10999511253155353900722704602341888D+02 + px(48)= 0.12644139881765944224874013113714956D+02 + px(49)= 0.14553337047736742937007375682160668D+02 + px(50)= 0.16781991883653111960448246164544552D+02 + px(51)= 0.19404807001908978674636447631480543D+02 + px(52)= 0.22530041649957455873120088574563768D+02 + px(53)= 0.26329908156600549662558562174411129D+02 + px(54)= 0.31123139662609753954221361691834122D+02 + px(55)= 0.37704208661211514525413812039019937D+02 + pw( 1)= 0.66197010017150176035001943503791612D-03 + pw( 2)= 0.15473986838973462236228078334406893D-02 + pw( 3)= 0.24498237063923531340555158367628164D-02 + pw( 4)= 0.33776287859319922812719456819175282D-02 + pw( 5)= 0.43411225775038708068974684768107364D-02 + pw( 6)= 0.53518651954260805354105863317050609D-02 + pw( 7)= 0.64229967432330231179415055321225939D-02 + pw( 8)= 0.75696698103592431759756449888418760D-02 + pw( 9)= 0.88095427367590507023412664390713554D-02 + pw(10)= 0.10163325161348951569439668089332658D-01 + pw(11)= 0.11655363240567140813674097334911855D-01 + pw(12)= 0.13314243348515058027067949270881268D-01 + pw(13)= 0.15173386635166955733192695121243376D-01 + pw(14)= 0.17271608028279645041513857207256865D-01 + pw(15)= 0.19653626901657341159725115198646941D-01 + pw(16)= 0.22370543498465587594423534082963911D-01 + pw(17)= 0.25480329057857957803698239091828550D-01 + pw(18)= 0.29048405984491612079844523910580431D-01 + pw(19)= 0.33148404854392620654285014862228099D-01 + pw(20)= 0.37863173170688566168679168381831541D-01 + pw(21)= 0.43286083546221674152033551748574709D-01 + pw(22)= 0.49522660152701866580552473522264330D-01 + pw(23)= 0.56692523854953065740425656620598762D-01 + pw(24)= 0.64931653102898776070657891627893174D-01 + pw(25)= 0.74394966967471873327666467149784361D-01 + pw(26)= 0.85259253137910454276787766290602372D-01 + pw(27)= 0.97726482395605256876361711073457773D-01 + pw(28)= 0.11202756962445724523518796251371951D+00 + pw(29)= 0.12842665975742201914453661139337216D+00 + pw(30)= 0.14722603673926230968313687924515801D+00 + pw(31)= 0.16877177710790063461389831494800560D+00 + pw(32)= 0.19346030041623174522759333491248744D+00 + pw(33)= 0.22174601068744206398277053899636196D+00 + pw(34)= 0.25415028231904386989193251104458640D+00 + pw(35)= 0.29127212894662705701804000206750184D+00 + pw(36)= 0.33380101778748297671825708760982179D+00 + pw(37)= 0.38253247505964083905642476699352139D+00 + pw(38)= 0.43838740189609651989190850364739389D+00 + pw(39)= 0.50243643541811971842045067743697266D+00 + pw(40)= 0.57593132895181935261643585397322053D+00 + pw(41)= 0.66034632680141149455034526996759720D+00 + pw(42)= 0.75743410948623930877746056875550536D+00 + pw(43)= 0.86930350487407763483137649785144665D+00 + pw(44)= 0.99853057127864209173395623916848905D+00 + pw(45)= 0.11483223403291556759761625679375804D+01 + pw(46)= 0.13227664361633706905774477225292859D+01 + pw(47)= 0.15272262858299299746420952422140745D+01 + pw(48)= 0.17689950292216856169402049903686857D+01 + pw(49)= 0.20584365667376688423615789562990280D+01 + pw(50)= 0.24111135465716154769717216299933567D+01 + pw(51)= 0.28521135641444982391089026540840319D+01 + pw(52)= 0.34259355468040848187361378226527105D+01 + pw(53)= 0.42232267265008794416077388243344182D+01 + pw(54)= 0.54752904412621487070917591943555763D+01 + pw(55)= 0.81175842524558588852537391429064074D+01 +endif +if(kn == 56) then + px( 1)= 0.25316296361047986979222704481695825D-03 + px( 2)= 0.13365901075979378376059799633963547D-02 + px( 3)= 0.32968187867088652972746128579161298D-02 + px( 4)= 0.61535889280423947494589016534393146D-02 + px( 5)= 0.99356438145931069871255487366443905D-02 + px( 6)= 0.14681977810167156277573501598213788D-01 + px( 7)= 0.20443111678193472909091734497338465D-01 + px( 8)= 0.27282717458052044439797443722558727D-01 + px( 9)= 0.35279661237909244138393562138044293D-01 + px(10)= 0.44530515113621224893066466054989604D-01 + px(11)= 0.55152581833421451048846493819641517D-01 + px(12)= 0.67287461982589797279652325494947566D-01 + px(13)= 0.81105172698463510127775615113455155D-01 + px(14)= 0.96808802192712951405093425956382214D-01 + px(15)= 0.11463966370426778674305108921559686D+00 + px(16)= 0.13488290707039924587391549901601733D+00 + px(17)= 0.15787356616502069558695735577341045D+00 + px(18)= 0.18400306948311470201893098688372094D+00 + px(19)= 0.21372631172205091315476055948506776D+00 + px(20)= 0.24756945964825417876144186941921348D+00 + px(21)= 0.28613872754179262008712244069824825D+00 + px(22)= 0.33013039508573925156714087895320715D+00 + px(23)= 0.38034235430595768392206349401494352D+00 + px(24)= 0.43768747182717257038476928803614522D+00 + px(25)= 0.50320905100551354990170632643254963D+00 + px(26)= 0.57809868566859133942855647933251305D+00 + px(27)= 0.66371681885395609603081291900385610D+00 + px(28)= 0.76161635782113158520389067408690172D+00 + px(29)= 0.87356975058892174416448481275277481D+00 + px(30)= 0.10015999992219932485646137689436440D+01 + px(31)= 0.11480161722880751230648197166792158D+01 + px(32)= 0.13154540866711925408169656894558134D+01 + px(33)= 0.15069229633419957830339656064577451D+01 + px(34)= 0.17258590324121035302982461790011709D+01 + px(35)= 0.19761872844199233411044622815807802D+01 + px(36)= 0.22623928590392104330658026022941003D+01 + px(37)= 0.25896039617088162412729887555211206D+01 + px(38)= 0.29636887519045083121995707405502705D+01 + px(39)= 0.33913694283389338916129064504517107D+01 + px(40)= 0.38803578612346352983211862453492671D+01 + px(41)= 0.44395187703404766581147350250334258D+01 + px(42)= 0.50790689079212912344545291525845662D+01 + px(43)= 0.58108244499131239038620507188960897D+01 + px(44)= 0.66485146168191270558987216454001789D+01 + px(45)= 0.76081888077726672589265324613407006D+01 + px(46)= 0.87087596930040374091019857045701857D+01 + px(47)= 0.99727503684543062151028368853863376D+01 + px(48)= 0.11427358864753300284154140246256655D+02 + px(49)= 0.13106036822460464759217266858454354D+02 + px(50)= 0.15050942893772414274408031537252686D+02 + px(51)= 0.17316976836533264840463009058352564D+02 + px(52)= 0.19978899582721245916009884455806908D+02 + px(53)= 0.23145132959066649402957025145184967D+02 + px(54)= 0.26988283646585129742188842153188599D+02 + px(55)= 0.31828066291287360025547176837209814D+02 + px(56)= 0.38461992963555384952401394913962473D+02 + pw( 1)= 0.65000398579028612344523107765589882D-03 + pw( 2)= 0.15191959531332206241117313092588598D-02 + pw( 3)= 0.24045082453807306790230685497814259D-02 + pw( 4)= 0.33138053120687689905848031561566419D-02 + pw( 5)= 0.42567825727886822265204218213429885D-02 + pw( 6)= 0.52442786039205566768969009014580993D-02 + pw( 7)= 0.62885672143814305898041140071920990D-02 + pw( 8)= 0.74037469337422375540169840316881048D-02 + pw( 9)= 0.86061856387838724696966695518776445D-02 + pw(10)= 0.99150131408851259539215604893253718D-02 + pw(11)= 0.11352651712245621514495559004367745D-01 + pw(12)= 0.12945367107597103270871461076597819D-01 + pw(13)= 0.14723816350026995714384635716386299D-01 + pw(14)= 0.16723567921281172025119870508991203D-01 + pw(15)= 0.18985579339702223096072943965050960D-01 + pw(16)= 0.21556638092713088509079450535846802D-01 + pw(17)= 0.24489800737865776493594940653820355D-01 + pw(18)= 0.27844892154300886517220254607831059D-01 + pw(19)= 0.31689141187091977270960955143508341D-01 + pw(20)= 0.36098024102012043921594821940947720D-01 + pw(21)= 0.41156366531808391430233301214909888D-01 + pw(22)= 0.46959728529751547723837911521797776D-01 + pw(23)= 0.53616077334483042468992867566071648D-01 + pw(24)= 0.61247744777900280040241146591130434D-01 + pw(25)= 0.69993670734405775857440933512557351D-01 + pw(26)= 0.80011946473114843210018552446024989D-01 + pw(27)= 0.91482687623385083454637087474649108D-01 + pw(28)= 0.10461128275712219242263696233465467D+00 + pw(29)= 0.11963207950030344698937570198730592D+00 + pw(30)= 0.13681258639855890430946376814155933D+00 + pw(31)= 0.15645828725394505928296678986188780D+00 + pw(32)= 0.17891818774495993299205732107037433D+00 + pw(33)= 0.20459124493705478925339465025762632D+00 + pw(34)= 0.23393387292296502681950858384393993D+00 + pw(35)= 0.26746877815837016930288816174815801D+00 + pw(36)= 0.30579546472966641872754674677137451D+00 + pw(37)= 0.34960287592590148622876766322786034D+00 + pw(38)= 0.39968482435244330897498051519252302D+00 + pw(39)= 0.45695914023983362115797429164759424D+00 + pw(40)= 0.52249188739719558018523893289646747D+00 + pw(41)= 0.59752864128661802876559766037133258D+00 + pw(42)= 0.68353583227497725169589493456556878D+00 + pw(43)= 0.78225676673770165676669761907089189D+00 + pw(44)= 0.89578957040518890914540096033762699D+00 + pw(45)= 0.10266987265790718339050961200134187D+01 + pw(46)= 0.11781795916828648269313771182072094D+01 + pw(47)= 0.13543092491251640404184331203718613D+01 + pw(48)= 0.15604436574176829660651085498995850D+01 + pw(49)= 0.18038746446191042006169647129755746D+01 + pw(50)= 0.20949760879837258079028998605170623D+01 + pw(51)= 0.24493410971001376006947638889013500D+01 + pw(52)= 0.28921265037563651103311910556760302D+01 + pw(53)= 0.34679810059374950896183058369361721D+01 + pw(54)= 0.42678969880522956032021408252168488D+01 + pw(55)= 0.55241361350341697933563283440847313D+01 + pw(56)= 0.81763098024027352046055737635386992D+01 +endif +if(kn == 57) then + px( 1)= 0.24868815335455860789772463829278776D-03 + px( 2)= 0.13128708050967756872359979837581267D-02 + px( 3)= 0.32378911411275754242453920014506796D-02 + px( 4)= 0.60424464389102377765134145508034116D-02 + px( 5)= 0.97537140744139523885808398942622953D-02 + px( 6)= 0.14408512776948249945597359956767419D-01 + px( 7)= 0.20054465803924087173957630667856565D-01 + px( 8)= 0.26751478412571506944557766303264093D-01 + px( 9)= 0.34573592603908669936189665703461548D-01 + px(10)= 0.43611265799596486187161513111221049D-01 + px(11)= 0.53974113832253153451922084032312655D-01 + px(12)= 0.65794147604388044446693612420778637D-01 + px(13)= 0.79229515286929499370440341287450519D-01 + px(14)= 0.94468740236461490024906772241244278D-01 + px(15)= 0.11173542497728950196695294189711635D+00 + px(16)= 0.13129338283356472153949413479135588D+00 + px(17)= 0.15345217055234222351399145423334369D+00 + px(18)= 0.17857303240016059517363555848698504D+00 + px(19)= 0.20707532489301037480311227730726369D+00 + px(20)= 0.23944355888568364571611377084388653D+00 + px(21)= 0.27623525598764641501713800314236035D+00 + px(22)= 0.31808985711331926597193028676339348D+00 + px(23)= 0.36573893965362420393418300064924317D+00 + px(24)= 0.42001800250816678509315665005653842D+00 + px(25)= 0.48188007571772045658000477338572683D+00 + px(26)= 0.55241141366947723712332135298933352D+00 + px(27)= 0.63284954422789175567060868091083275D+00 + px(28)= 0.72460397335389008193055638126809504D+00 + px(29)= 0.82927988618893103652768340546531032D+00 + px(30)= 0.94870524097232991512519332944115950D+00 + px(31)= 0.10849617219948828718866932170405878D+01 + px(32)= 0.12404201040143688064542721435857076D+01 + px(33)= 0.14177806871234317057617629235375845D+01 + px(34)= 0.16201195944991234133183611776590787D+01 + px(35)= 0.18509418957597958737177594237457576D+01 + px(36)= 0.21142427406807205051181553190404025D+01 + px(37)= 0.24145779838341929681853920899859986D+01 + px(38)= 0.27571461831926801287738694538112624D+01 + px(39)= 0.31478844142641227801141212544520951D+01 + px(40)= 0.35935811310102033845744816463442230D+01 + px(41)= 0.41020104413877221957804467425939289D+01 + px(42)= 0.46820938302391253474628239056559849D+01 + px(43)= 0.53440978451440606526835982001965101D+01 + px(44)= 0.60998800348958771477889866745643881D+01 + px(45)= 0.69632012890426281929489833882602384D+01 + px(46)= 0.79501320435284778078392913107490947D+01 + px(47)= 0.90795950558462101790938546852706470D+01 + px(48)= 0.10374113225206633587054123726355172D+02 + px(49)= 0.11860876304821553920803819021800250D+02 + px(50)= 0.13573324197835825157192895761042269D+02 + px(51)= 0.15553608911071130591094968547049722D+02 + px(52)= 0.17856644009398398334027651945364308D+02 + px(53)= 0.20557252972621646881571546529989799D+02 + px(54)= 0.23764024975150934121434317734353805D+02 + px(55)= 0.27649965226105594436613117812938989D+02 + px(56)= 0.32535770450738589606101398766605691D+02 + px(57)= 0.39221977092927801982893352173282336D+02 + pw( 1)= 0.63850406831505099738052476325581961D-03 + pw( 2)= 0.14921037986204782494894229786059731D-02 + pw( 3)= 0.23610114029305975624758426308210372D-02 + pw( 4)= 0.32526128541318438561754430100405078D-02 + pw( 5)= 0.41760397770836242676248541207424580D-02 + pw( 6)= 0.51414703399848581702756105177606384D-02 + pw( 7)= 0.61603891216042467763280301070012439D-02 + pw( 8)= 0.72459385496994110627368996655782062D-02 + pw( 9)= 0.84133204261124725154997593044037002D-02 + pw(10)= 0.96802413566072434418561861828501851D-02 + pw(11)= 0.11067394131957919015201993364016383D-01 + pw(12)= 0.12598960733898038415484720369114255D-01 + pw(13)= 0.14303116690973784728070109566032249D-01 + pw(14)= 0.16212514694032192842344075317239116D-01 + pw(15)= 0.18364731480288355645565506966903663D-01 + pw(16)= 0.20802678116001007719911702716244623D-01 + pw(17)= 0.23574997949263392825181094032890825D-01 + pw(18)= 0.26736501608313518430082020410358951D-01 + pw(19)= 0.30348704821972667356360038317766333D-01 + pw(20)= 0.34480535573107543829081934087773551D-01 + pw(21)= 0.39209262449594068155056630028607624D-01 + pw(22)= 0.44621673558809258382775767008026743D-01 + pw(23)= 0.50815515123079612189194336850658426D-01 + pw(24)= 0.57901188015516540741960287519003280D-01 + pw(25)= 0.66003700690181420273674403867109883D-01 + pw(26)= 0.75264885586780504125481386653660311D-01 + pw(27)= 0.85845899110248805481309245561278484D-01 + pw(28)= 0.97930039537017286368934956545788095D-01 + pw(29)= 0.11172593126446497910307506265827343D+00 + pw(30)= 0.12747113783580932628750351291841410D+00 + pw(31)= 0.14543628129357976189998302298577329D+00 + pw(32)= 0.16592976340385132601117576239709225D+00 + pw(33)= 0.18930320741347600799686592812399379D+00 + pw(34)= 0.21595777019635163082254656264589623D+00 + pw(35)= 0.24635151796681218166031807821910603D+00 + pw(36)= 0.28100812005137097745126867432418044D+00 + pw(37)= 0.32052720324236309080623061527301095D+00 + pw(38)= 0.36559683724134186969654741198617812D+00 + pw(39)= 0.41700881002095348086483260374822063D+00 + pw(40)= 0.47567763253365164936264867132129384D+00 + pw(41)= 0.54266463607013432598384544728159736D+00 + pw(42)= 0.61920917583183104118650977432015768D+00 + pw(43)= 0.70676996963437778493319526328106689D+00 + pw(44)= 0.80708121914722438188398403384545040D+00 + pw(45)= 0.92223080476089922545257576125601390D+00 + pw(46)= 0.10547722906765838946076024117162519D+01 + pw(47)= 0.12078902142342952742295563718452734D+01 + pw(48)= 0.13856521611695241681852077225258712D+01 + pw(49)= 0.15934078194682780100598346033527563D+01 + pw(50)= 0.18384490024798631769423787694389017D+01 + pw(51)= 0.21311608735825934018647767200886737D+01 + pw(52)= 0.24871681932345592025950623054739285D+01 + pw(53)= 0.29316978875123326489119117358318389D+01 + pw(54)= 0.35095485536224900059615141531182325D+01 + pw(55)= 0.43120561559297553662108510075135951D+01 + pw(56)= 0.55724340409598520094151224510004002D+01 + pw(57)= 0.82344141971751154156256216589492573D+01 +endif +if(kn == 58) then + px( 1)= 0.24434033394451184831332088274972614D-03 + px( 2)= 0.12898297855758084794990735012824795D-02 + px( 3)= 0.31806717757901164181366233077269143D-02 + px( 4)= 0.59345896357739671411939250656689172D-02 + px( 5)= 0.95773007935647589740131510133326933D-02 + px( 6)= 0.14143599916031750336288464659818307D-01 + px( 7)= 0.19678421471099466334328004427638463D-01 + px( 8)= 0.26238185206866063929609421458766724D-01 + px( 9)= 0.33892482730395244265993385655932016D-01 + px(10)= 0.42726147636423146495787112568669502D-01 + px(11)= 0.52841746547136186264664886791106024D-01 + px(12)= 0.64362519550464307152961734809637400D-01 + px(13)= 0.77435783944099717432445083945677444D-01 + px(14)= 0.92236796334736021064525541096205779D-01 + px(15)= 0.10897304960833762497368263739776640D+00 + px(16)= 0.12788897058127535161532867316020021D+00 + px(17)= 0.14927098966869470910429970787632708D+00 + px(18)= 0.17345298118495822795560109857844673D+00 + px(19)= 0.20082212068477187733619503336787526D+00 + px(20)= 0.23182526492157509727427115110610957D+00 + px(21)= 0.26697601677385238610408934208835716D+00 + px(22)= 0.30686267976111180750915583977482233D+00 + px(23)= 0.35215732947079012938113786105805915D+00 + px(24)= 0.40362623554087235036623549904310772D+00 + px(25)= 0.46214186627722416843651050017300999D+00 + px(26)= 0.52869670779772405928275015906498196D+00 + px(27)= 0.60441913758500942132872353014906915D+00 + px(28)= 0.69059161190173186977923080505129435D+00 + px(29)= 0.78867145852021566336079167775472318D+00 + px(30)= 0.90031461037842010940804732183197484D+00 + px(31)= 0.10274026720013124888351576363117054D+01 + px(32)= 0.11720737797358867139870870772361403D+01 + px(33)= 0.13367578014779413237552856365102921D+01 + px(34)= 0.15242165260123579138727083350791305D+01 + px(35)= 0.17375896232753198202855353912673167D+01 + px(36)= 0.19804473251817493601350778629165516D+01 + px(37)= 0.22568509975250307614430520806868566D+01 + px(38)= 0.25714230695306034625372310347575688D+01 + px(39)= 0.29294281925088078526353597436284922D+01 + px(40)= 0.33368680628505624786084197643705525D+01 + px(41)= 0.38005931431891197468681483744664603D+01 + px(42)= 0.43284356654318541562597981524559705D+01 + px(43)= 0.49293699827199698643746537542247355D+01 + px(44)= 0.56137088447382837752405108085311518D+01 + px(45)= 0.63933479763797142778167198658680473D+01 + px(46)= 0.72820772388702724076103924229296585D+01 + px(47)= 0.82959860217191960273479806666773572D+01 + px(48)= 0.94540058246887833757920831704609262D+01 + px(49)= 0.10778658868884060967669218732823142D+02 + px(50)= 0.12297127123681394967235226888868496D+02 + px(51)= 0.14042840296906789602475373898648457D+02 + px(52)= 0.16057946341424185599637733181299734D+02 + px(53)= 0.18397376144609008660129440553886667D+02 + px(54)= 0.21136019829441359432294268771943368D+02 + px(55)= 0.24382637830465314053970048093781607D+02 + px(56)= 0.28310635883229105824662724848457888D+02 + px(57)= 0.33241688156491882652244516947157465D+02 + px(58)= 0.39979324854730350560380760310991567D+02 + pw( 1)= 0.62733109048409588703595816867276199D-03 + pw( 2)= 0.14657936036557848053358560746138226D-02 + pw( 3)= 0.23188039412803108240913072658971970D-02 + pw( 4)= 0.31933033503497732665912080929284740D-02 + pw( 5)= 0.40979006451946182384357721499095629D-02 + pw( 6)= 0.50421643688026695837303498584333168D-02 + pw( 7)= 0.60368540232012649771110859344923619D-02 + pw( 8)= 0.70942376382161779284891095459836907D-02 + pw( 9)= 0.82284548942576416691892667668446138D-02 + pw(10)= 0.94559203825706345816982076964964040D-02 + pw(11)= 0.10795760712764688238988523701079555D-01 + pw(12)= 0.12270273749594030379848145803872545D-01 + pw(13)= 0.13905392767905254997074541953428596D-01 + pw(14)= 0.15731135789435791324224040917880572D-01 + pw(15)= 0.17782024053941809054144110879593548D-01 + pw(16)= 0.20097465728382493095345225347477278D-01 + pw(17)= 0.22722120795621834323838288686748639D-01 + pw(18)= 0.25706285556264310174964040630928441D-01 + pw(19)= 0.29106352315788903005352971189456581D-01 + pw(20)= 0.32985404610440883148326461811537831D-01 + pw(21)= 0.37413999049710482757623335110546207D-01 + pw(22)= 0.42471166374810272031277917315265662D-01 + pw(23)= 0.48245645122530704885901911499992297D-01 + pw(24)= 0.54837348649234653699497897577026759D-01 + pw(25)= 0.62359063229977042722185054531332040D-01 + pw(26)= 0.70938380290334218998361325437647613D-01 + pw(27)= 0.80719876243549163356586986213036934D-01 + pw(28)= 0.91867565718926819665093909060870324D-01 + pw(29)= 0.10456766632576442199326583277715739D+00 + pw(30)= 0.11903172509910440103985964305294291D+00 + pw(31)= 0.13550016907993941758980848856857065D+00 + pw(32)= 0.15424635639536902570179058036488656D+00 + pw(33)= 0.17558122145590420153365262455970893D+00 + pw(34)= 0.19985863067967063227542616526431877D+00 + pw(35)= 0.22748159641691842637396351126311924D+00 + pw(36)= 0.25890954059680923306019616384762021D+00 + pw(37)= 0.29466686202428234212246962384893466D+00 + pw(38)= 0.33535315105334115324523894344163479D+00 + pw(39)= 0.38165552584862205474730474843490408D+00 + pw(40)= 0.43436375602075320325783351993460999D+00 + pw(41)= 0.49438912403001643077208212506747590D+00 + pw(42)= 0.56278840340808053436660241948913705D+00 + pw(43)= 0.64079498868817746739885701660482042D+00 + pw(44)= 0.72986023375748618090030528388862540D+00 + pw(45)= 0.83170968157393847753518524949795654D+00 + pw(46)= 0.94842152201009860199494152865552426D+00 + pw(47)= 0.10825390752513919957077425880259437D+01 + pw(48)= 0.12372368612987946539691608858506276D+01 + pw(49)= 0.14165738915071239610049524678978730D+01 + pw(50)= 0.16258946038408501735019290717589660D+01 + pw(51)= 0.18724918616166796018089411475069718D+01 + pw(52)= 0.21667631278657422335640585222239716D+01 + pw(53)= 0.25243656135025940850141001255242091D+01 + pw(54)= 0.29705966338517197699939016107637128D+01 + pw(55)= 0.35504038802739003258026682341550974D+01 + pw(56)= 0.43554634619204172726228503521716960D+01 + pw(57)= 0.56199288682781632030749398177962085D+01 + pw(58)= 0.82915991164603216241594990786246181D+01 +endif +if(kn == 59) then + px( 1)= 0.24017044279779292814629839617243450D-03 + px( 2)= 0.12677357311318245204340430050679257D-02 + px( 3)= 0.31258222629100185940076720172210997D-02 + px( 4)= 0.58312499497336295459920293987796347D-02 + px( 5)= 0.94083841957770971773603088185375410D-02 + px( 6)= 0.13890148671899779318807930205580027D-01 + px( 7)= 0.19318997801708323654202530335822085D-01 + px( 8)= 0.25748144263471581820366574897844328D-01 + px( 9)= 0.33243095406506599413902998194831696D-01 + px(10)= 0.41883537206821979634762367168491752D-01 + px(11)= 0.51765600230040784734413248495245234D-01 + px(12)= 0.63004534953194280528857501029401263D-01 + px(13)= 0.75737811688489614503254126944665394D-01 + px(14)= 0.90128644081599828866050645306435530D-01 + px(15)= 0.10636991817245456656640674998882425D+00 + px(16)= 0.12468849734796975917708256298381635D+00 + px(17)= 0.14534987449428730815870543082646967D+00 + px(18)= 0.16866316194781867947963273417414352D+00 + px(19)= 0.19498644807014926777480289949553731D+00 + px(20)= 0.22473260025151679743882657221256944D+00 + px(21)= 0.25837564665252656619516749971741362D+00 + px(22)= 0.29645791158221640514428954276611043D+00 + px(23)= 0.33959810551825596456611237045554791D+00 + px(24)= 0.38850058094912455808801515111809716D+00 + px(25)= 0.44396596557949377138507728624091020D+00 + px(26)= 0.50690338304849533824426898328468191D+00 + px(27)= 0.57834447492359249583873987633512907D+00 + px(28)= 0.65945945037149605268093152551109157D+00 + px(29)= 0.75157541311753698549281538653947915D+00 + px(30)= 0.85619724923675556880411715666490139D+00 + px(31)= 0.97503140385009140556694003085244755D+00 + px(32)= 0.11100129303358947930559321811557219D+01 + px(33)= 0.12633362636366230834317390982154572D+01 + px(34)= 0.14374902524376435821063430688457459D+01 + px(35)= 0.16352980880451859683554892839580029D+01 + px(36)= 0.18599628978969427131632789868222316D+01 + px(37)= 0.21151199396574527695286927569279424D+01 + px(38)= 0.24048965537486799801854034104807364D+01 + px(39)= 0.27339813311614543030519151016136162D+01 + px(40)= 0.31077043634013766751037593199231006D+01 + px(41)= 0.35321310127538594381872851430342547D+01 + px(42)= 0.40141724497033117837380156117688880D+01 + px(43)= 0.45617173671065324231220714157123560D+01 + px(44)= 0.51837909802502262421170872012101186D+01 + px(45)= 0.58907499495757681122761284245660513D+01 + px(46)= 0.66945256944244027786731357744463874D+01 + px(47)= 0.76089345000234897625689957165319158D+01 + px(48)= 0.86500822381865882984046085645261005D+01 + px(49)= 0.98369069052844445818188741380057179D+01 + px(50)= 0.11191928172141132463081500916511436D+02 + px(51)= 0.12742318869317979750127012420299866D+02 + px(52)= 0.14521497812382557030759980660881216D+02 + px(53)= 0.16571609002403235432729979108495388D+02 + px(54)= 0.18947601696583022289134197021834890D+02 + px(55)= 0.21724434819141199485567406823338980D+02 + px(56)= 0.25011043569653595241704228425344735D+02 + px(57)= 0.28981239539754668238717062448027987D+02 + px(58)= 0.33957682861442574052880069059391550D+02 + px(59)= 0.40746915059614258028591599305878563D+02 + pw( 1)= 0.61661580843002818474199774713820435D-03 + pw( 2)= 0.14405704070514471179034636856714158D-02 + pw( 3)= 0.22783669060606481294489644369246025D-02 + pw( 4)= 0.31365358110962922351769249715961996D-02 + pw( 5)= 0.40232040175153947001508790877931979D-02 + pw( 6)= 0.49473803193663214737587320055863858D-02 + pw( 7)= 0.59191612038299517714653169073172258D-02 + pw( 8)= 0.69500182956008661963671339567494311D-02 + pw( 9)= 0.80531272876872073780988243944927837D-02 + pw(10)= 0.92437335233207542919233267694815444D-02 + pw(11)= 0.10539549268603537847804050413706850D-01 + pw(12)= 0.11961173126551657822377551429583253D-01 + pw(13)= 0.13532517058206043238763114996607351D-01 + pw(14)= 0.15281223576152454708908655408040014D-01 + pw(15)= 0.17239057603729693052163344051819092D-01 + pw(16)= 0.19442266458961476989325558184455495D-01 + pw(17)= 0.21931917469546401944382556564398834D-01 + pw(18)= 0.24754242433746641427431268228835038D-01 + pw(19)= 0.27961035103797151420897993567315053D-01 + pw(20)= 0.31610155514745908523448651363558848D-01 + pw(21)= 0.35766190345276654301560678957468317D-01 + pw(22)= 0.40501304167252803397704165063681030D-01 + pw(23)= 0.45896299064297235092911388363815978D-01 + pw(24)= 0.52041886574768681501627601462919698D-01 + pw(25)= 0.59040170127350183777203679746281117D-01 + pw(26)= 0.67006338167425270345951219198202241D-01 + pw(27)= 0.76070575665505304542628473598135760D-01 + pw(28)= 0.86380211831507007368019890251542642D-01 + pw(29)= 0.98102132636666032695755579145966437D-01 + pw(30)= 0.11142549737101106673605436812360518D+00 + pw(31)= 0.12656480912918894407887994146661412D+00 + pw(32)= 0.14376340063800550321586640229070301D+00 + pw(33)= 0.16329741041010875411766070944591531D+00 + pw(34)= 0.18548034139204283612129505343431894D+00 + pw(35)= 0.21066831722133520869183148142544999D+00 + pw(36)= 0.23926618297652114852711313447455029D+00 + pw(37)= 0.27173464220061857023595757633130234D+00 + pw(38)= 0.30859868602838502541227655874922030D+00 + pw(39)= 0.35045766213824028933896241045941680D+00 + pw(40)= 0.39799746405898883695366965337609207D+00 + pw(41)= 0.45200551538659797325171404962194440D+00 + pw(42)= 0.51338951072222608042425293298227461D+00 + pw(43)= 0.58320130681121704145811176462102830D+00 + pw(44)= 0.66266801682533587367291708503855157D+00 + pw(45)= 0.75323338727944523148159051268884834D+00 + pw(46)= 0.85661417022141803759749168951905898D+00 + pw(47)= 0.97487886818484736512715176832767718D+00 + pw(48)= 0.11105607080548591676445817081025167D+01 + pw(49)= 0.12668244926674778382406916781590943D+01 + pw(50)= 0.14477211050240886139248358867709201D+01 + pw(51)= 0.16585903209293941982874152551002654D+01 + pw(52)= 0.19067267819120553533052269878705101D+01 + pw(53)= 0.22025411213234530205031685493395576D+01 + pw(54)= 0.25617240229328119031730477599209231D+01 + pw(55)= 0.30096444005352699081587070335073080D+01 + pw(56)= 0.35914005587611010508966597641663763D+01 + pw(57)= 0.43990110029151586972435889347654638D+01 + pw(58)= 0.56675740145564914236518411943414860D+01 + pw(59)= 0.83489730609712306624049954354151761D+01 +endif +end subroutine wts500 +end MODULE WTS500_MOD diff --git a/src/trans/gpu/external/dir_trans.F90 b/src/trans/gpu/external/dir_trans.F90 new file mode 100755 index 00000000..e3541a5d --- /dev/null +++ b/src/trans/gpu/external/dir_trans.F90 @@ -0,0 +1,523 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& +& LDLATLON,KPROMA,KVSETUV,KVSETSC,KRESOL,KVSETSC3A,KVSETSC3B,KVSETSC2,& +& PGP,PGPUV,PGP3A,PGP3B,PGP2) + + +!**** *DIR_TRANS* - Direct spectral transform (from grid-point to spectral). + +! Purpose. +! -------- +! Interface routine for the direct spectral transform + +!** Interface. +! ---------- +! CALL DIR_TRANS(...) + +! Explicit arguments : All arguments except from PGP are optional. +! -------------------- +! PSPVOR(:,:) - spectral vorticity (output) +! PSPDIV(:,:) - spectral divergence (output) +! PSPSCALAR(:,:) - spectral scalarvalued fields (output) +! PSPSC3A(:,:,:) - alternative to use of PSPSCALAR, see PGP3A below (input) +! PSPSC3B(:,:,:) - alternative to use of PSPSCALAR, see PGP3B below (input) +! PSPSC2(:,:) - alternative to use of PSPSCALAR, see PGP2 below (input) +! LDLATLON - indicating if regular lat-lon is the input data +! KPROMA - required blocking factor for gridpoint output +! KVSETUV(:) - indicating which 'b-set' in spectral space owns a +! vor/div field. Equivalant to NBSETLEV in the IFS. +! The length of KVSETUV should be the GLOBAL number +! of u/v fields which is the dimension of u and v releated +! fields in grid-point space. +! KVESETSC(:) - indicating which 'b-set' in spectral space owns a +! scalar field. As for KVSETUV this argument is required +! if the total number of processors is greater than +! the number of processors used for distribution in +! spectral wave space. +! KVSETSC3A(:) - as KVESETSC for PSPSC3A (distribution on first dimension) +! KVSETSC3B(:) - as KVESETSC for PSPSC3B (distribution on first dimension) +! KVSETSC2(:) - as KVESETSC for PSPSC2 (distribution on first dimension) +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PGP(:,:,:) - gridpoint fields (input) +! PGP need to dimensioned (NPROMA,IF_GP,NGPBLKS) where +! NPROMA is the blocking factor, IF_GP the total number +! of output fields and NGPBLKS the number of NPROMA blocks. +! The ordering of the output fields is as follows (all +! parts are optional depending on the input switches): +! +! u : IF_UV_G fields (if psvor present) +! v : IF_UV_G fields (if psvor present) +! scalar fields : IF_SCALARS_G fields (if pspscalar present) +! +! Here IF_UV_G is the GLOBAL number of u/v fields as given by the length +! of KVSETUV (or by PSPVOR if no split in spectral 'b-set' direction +! IF_SCALARS_G is the GLOBAL number of scalar fields as giben by the +! length of KVESETSC (or by number of fields in PSPSCALAR if no spectral +! 'b-set' split + +! As an alternative to using PGP you can also use a combination of the +! following arrays. The reason for introducing these alternative ways +! of calling DIR_TRANS is to avoid uneccessary copies where your data +! structures don't fit in to the 'PSPVOR,PSPDIV, PSPSCALAR, PGP' layout. +! The use of any of these precludes the use of PGP and vice versa. + +! PGPUV(:,:,:,:) - the 'u-v' related grid-point variables in the order +! described for PGP. The second dimension of PGPUV should +! be the same as the "global" first dimension of +! PSPVOR,PSPDIV (in the IFS this is the number of levels) +! PGPUV need to be dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (u,v) +! PGP3A(:,:,:,:) - grid-point array directly connected with PSPSC3A +! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC3A ) +! PGP3B(:,:,:,:) - grid-point array directly connected with PSPSC3B +! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC3B) +! PGP2(:,:,:) - grid-point array directly connected with PSPSC2 +! dimensioned(NPROMA,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC2 ) +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- LTDIR_CTL - control of Legendre transform +! FTDIR_CTL - control of Fourier transform + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR, NOUT, LSYNC_TRANS +USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, LATLON, & + & NF_SC2, NF_SC3A, NF_SC3B, & + & NGPBLKS, NPROMA +USE TPM_DISTR ,ONLY : D, NPRTRV, MYSETV +USE TPM_FLT ,ONLY : S +USE TPM_GEOMETRY ,ONLY : G +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE DIR_TRANS_CTL_MOD,ONLY : DIR_TRANS_CTL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_STATS ,ONLY : GSTATS => GSTATS_NVTX + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +LOGICAL ,OPTIONAL, INTENT(IN) :: LDLATLON + +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP(:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGPUV(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP3A(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP3B(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP2(:,:,:) + +!ifndef INTERFACE + +! Local variables +INTEGER(KIND=JPIM) :: IUBOUND(4),J +INTEGER(KIND=JPIM) :: IF_UV,IF_UV_G,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP +INTEGER(KIND=JPIM) :: IF_SC2_G,IF_SC3A_G,IF_SC3B_G +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE +INTEGER(KIND=JPIM) :: JMLOC, IF_PP + +! ------------------------------------------------------------------ +IF (LHOOK) CALL DR_HOOK('DIR_TRANS',0,ZHOOK_HANDLE) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF +CALL GSTATS(410,0) +CALL GSTATS(1808,0) +! Set current resolution +CALL SET_RESOL(KRESOL) + +! Set defaults + +IF_UV = 0 +IF_UV_G = 0 +IF_SCALARS = 0 +IF_SCALARS_G = 0 +NF_SC2 = 0 +NF_SC3A = 0 +NF_SC3B = 0 +IF_SC2_G = 0 +IF_SC3A_G = 0 +IF_SC3B_G = 0 +NPROMA = D%NGPTOT +! This is for use in TRGTOL which is shared with adjoint inverse transform +LSCDERS=.FALSE. +LVORGP=.FALSE. +LDIVGP=.FALSE. +LUVDER=.FALSE. +LATLON=.FALSE. + +! Decide requirements + +IF(PRESENT(KVSETUV)) THEN + IF_UV_G = UBOUND(KVSETUV,1) + DO J=1,IF_UV_G + IF(KVSETUV(J) > NPRTRV .OR. KVSETUV(J) < 1) THEN + WRITE(NERR,*) 'DIR_TRANS:KVSETUV(J) > NPRTRV ',J,KVSETUV(J),NPRTRV + CALL ABORT_TRANS('DIR_TRANS:KVSETUV TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETUV(J) == MYSETV) THEN + IF_UV = IF_UV+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPVOR)) THEN + IF_UV = UBOUND(PSPVOR,1) + IF_UV_G = IF_UV +ENDIF + +IF(PRESENT(KVSETSC)) THEN + IF_SCALARS_G = UBOUND(KVSETSC,1) + DO J=1,IF_SCALARS_G + IF(KVSETSC(J) > NPRTRV .OR. KVSETSC(J) < 1) THEN + WRITE(NERR,*) 'DIR_TRANS:KVSETSC(J) > NPRTRV ',J,KVSETSC(J),NPRTRV + CALL ABORT_TRANS('DIR_TRANS:KVSETSC TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETSC(J) == MYSETV) THEN + IF_SCALARS = IF_SCALARS+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPSCALAR)) THEN + IF_SCALARS = UBOUND(PSPSCALAR,1) + IF_SCALARS_G = IF_SCALARS +ENDIF + +IF(PRESENT(KVSETSC2)) THEN + IF(.NOT.PRESENT(PSPSC2)) THEN + CALL ABORT_TRANS('DIR_TRANS:KVSETSC2 BUT NOT PSPSC2') + ENDIF + IF_SC2_G = UBOUND(KVSETSC2,1) + IF_SCALARS_G = IF_SCALARS_G+IF_SC2_G + DO J=1,UBOUND(KVSETSC2,1) + IF(KVSETSC2(J) > NPRTRV .OR. KVSETSC2(J) < 1) THEN + WRITE(NERR,*) 'DIR_TRANS:KVSETSC2(J) > NPRTRV ',J,KVSETSC2(J),NPRTRV + CALL ABORT_TRANS('DIR_TRANS:KVSETSC2 TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETSC2(J) == MYSETV) THEN + IF_SCALARS = IF_SCALARS+1 + NF_SC2 = NF_SC2+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPSC2)) THEN + IF_SC2_G = UBOUND(PSPSC2,1) + NF_SC2 = UBOUND(PSPSC2,1) + IF_SCALARS = IF_SCALARS+NF_SC2 + IF_SCALARS_G = IF_SCALARS_G +IF_SC2_G +ENDIF + +IF(PRESENT(KVSETSC3A)) THEN + IF(.NOT.PRESENT(PSPSC3A)) THEN + CALL ABORT_TRANS('DIR_TRANS:KVSETSC3A BUT NOT PSPSC3A') + ENDIF + IF_SC3A_G = UBOUND(KVSETSC3A,1) + IF_SCALARS_G = IF_SCALARS_G+IF_SC3A_G*UBOUND(PSPSC3A,3) + DO J=1,UBOUND(KVSETSC3A,1) + IF(KVSETSC3A(J) > NPRTRV .OR. KVSETSC3A(J) < 1) THEN + WRITE(NERR,*) 'DIR_TRANS:KVSETSC3A(J) > NPRTRV ',J,KVSETSC3A(J),NPRTRV + CALL ABORT_TRANS('DIR_TRANS:KVSETSC3A TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETSC3A(J) == MYSETV) THEN + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3A,3) + NF_SC3A = NF_SC3A+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPSC3A)) THEN + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3A,1)*UBOUND(PSPSC3A,3) + IF_SC3A_G = UBOUND(PSPSC3A,1) + IF_SCALARS_G = IF_SCALARS_G +IF_SC3A_G*UBOUND(PSPSC3A,3) + NF_SC3A = UBOUND(PSPSC3A,1) +ENDIF + +IF(PRESENT(KVSETSC3B)) THEN + IF(.NOT.PRESENT(PSPSC3B)) THEN + CALL ABORT_TRANS('DIR_TRANS:KVSETSC3B BUT NOT PSPSC3B') + ENDIF + IF_SC3B_G = UBOUND(KVSETSC3B,1) + IF_SCALARS_G = IF_SCALARS_G+IF_SC3B_G*UBOUND(PSPSC3B,3) + DO J=1,UBOUND(KVSETSC3B,1) + IF(KVSETSC3B(J) > NPRTRV .OR. KVSETSC3B(J) < 1) THEN + WRITE(NERR,*) 'DIR_TRANS:KVSETSC3B(J) > NPRTRV ',J,KVSETSC3B(J),NPRTRV + CALL ABORT_TRANS('DIR_TRANS:KVSETSC3B TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETSC3B(J) == MYSETV) THEN + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3B,3) + NF_SC3B = NF_SC3B+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPSC3B)) THEN + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3B,1)*UBOUND(PSPSC3B,3) + IF_SC3B_G = UBOUND(PSPSC3B,1) + IF_SCALARS_G = IF_SCALARS_G +IF_SC3B_G*UBOUND(PSPSC3B,3) + NF_SC3B = UBOUND(PSPSC3B,1) +ENDIF + +IF(PRESENT(KPROMA)) THEN + NPROMA = KPROMA +ENDIF + +IF(PRESENT(LDLATLON)) THEN + LATLON = LDLATLON +ENDIF + +! Compute derived variables + + +NGPBLKS = (D%NGPTOT-1)/NPROMA+1 + +IF_FS = 2*IF_UV + IF_SCALARS + +IF_GP = 2*IF_UV_G+IF_SCALARS_G + +! Consistency checks + +IF (IF_UV > 0) THEN + IF(.NOT. PRESENT(PSPVOR) ) THEN + CALL ABORT_TRANS('DIR_TRANS : IF_UV > 0 BUT PSPVOR MISSING') + ENDIF + IF(UBOUND(PSPVOR,1) < IF_UV) THEN + WRITE(NERR,*)'DIR_TRANS : UBOUND(PSPVOR,1) < IF_UV ',UBOUND(PSPVOR,1),IF_UV + CALL ABORT_TRANS('DIR_TRANS : PSPVOR TOO SHORT') + ENDIF + IF(.NOT. PRESENT(PSPDIV) ) THEN + CALL ABORT_TRANS('DIR_TRANS : PSPVOR PRESENT BUT PSPDIV MISSING') + ENDIF + IF(UBOUND(PSPDIV,1) /= IF_UV) THEN + WRITE(NERR,*)'DIR_TRANS : UBOUND(PSPDIV,1) < IF_UV ',UBOUND(PSPDIV,1),IF_UV + CALL ABORT_TRANS('DIR_TRANS : INCONSISTENT FIRST DIM. OF PSPVOR AND PSPDIV') + ENDIF +ENDIF + +IF (IF_SCALARS > 0) THEN + IF(PRESENT(PSPSCALAR)) THEN + IF(UBOUND(PSPSCALAR,1) < IF_SCALARS) THEN + WRITE(NERR,*)'DIR_TRANS : UBOUND(PSPSCALAR,1) < IF_SCALARS) ',& + & UBOUND(PSPSCALAR,1),IF_SCALARS + CALL ABORT_TRANS('DIR_TRANS : PSPSCALAR TOO SHORT') + ENDIF + IF(PRESENT(PSPSC3A))THEN + CALL ABORT_TRANS('DIR_TRANS : PSPSCALAR AND PSPSC3A BOTH PRESENT') + ENDIF + IF(PRESENT(PSPSC3B))THEN + CALL ABORT_TRANS('DIR_TRANS : PSPSCALAR AND PSPSC3B BOTH PRESENT') + ENDIF + IF(PRESENT(PSPSC2))THEN + CALL ABORT_TRANS('DIR_TRANS : PSPSCALAR AND PSPSC2 BOTH PRESENT') + ENDIF + ENDIF +ENDIF + +IF(NPRTRV >1) THEN + IF(IF_UV > 0 .AND. .NOT. PRESENT(KVSETUV)) THEN + WRITE(NERR,*)'NPRTRV >1 AND IF_UV > 0 AND NOT PRESENT(KVSETUV)',& + &NPRTRV,IF_UV + CALL ABORT_TRANS('DIR_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF + IF(PRESENT(PSPSCALAR) .AND. .NOT. PRESENT(KVSETSC)) THEN + WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSCALAR) AND NOT PRESENT(KVSETSC)',& + &NPRTRV + CALL ABORT_TRANS('DIR_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF + IF(PRESENT(PSPSC2) .AND. .NOT. PRESENT(KVSETSC2)) THEN + WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC2) AND NOT PRESENT(KVSETSC2)',& + &NPRTRV + CALL ABORT_TRANS('DIR_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF + IF(PRESENT(PSPSC3A) .AND. .NOT. PRESENT(KVSETSC3A)) THEN + WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC3A) AND NOT PRESENT(KVSETSC3A)',& + &NPRTRV + CALL ABORT_TRANS('DIR_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF + IF(PRESENT(PSPSC3B) .AND. .NOT. PRESENT(KVSETSC3B)) THEN + WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC3B) AND NOT PRESENT(KVSETSC3B)',& + &NPRTRV + CALL ABORT_TRANS('DIR_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF +ENDIF + +IF(PRESENT(PGP)) THEN + IUBOUND(1:3)=UBOUND(PGP) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'DIR_TRANS:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('DIR_TRANS:FIRST DIMENSION OF PGP TOO SMALL ') + ENDIF + IF(IUBOUND(2) < IF_GP) THEN + WRITE(NOUT,*)'DIR_TRANS:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),IF_GP + CALL ABORT_TRANS('DIR_TRANS:SECOND DIMENSION OF PGP TOO SMALL ') + ENDIF + IF(IUBOUND(3) < NGPBLKS) THEN + WRITE(NOUT,*)'DIR_TRANS:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),NGPBLKS + CALL ABORT_TRANS('DIR_TRANS:THIRD DIMENSION OF PGP TOO SMALL ') + ENDIF +ENDIF + +IF(PRESENT(PGPUV)) THEN + IF(.NOT.PRESENT(PSPVOR)) THEN + CALL ABORT_TRANS('DIR_TRANS:PSPVOR HAS TO BE PRESENT WHEN PGPUV IS') + ENDIF + IUBOUND=UBOUND(PGPUV) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'DIR_TRANS:FIRST DIM. OF PGPUV TOO SMALL ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('DIR_TRANS:FIRST DIMENSION OF PGPUV TOO SMALL ') + ENDIF + IF(IUBOUND(2) /= IF_UV_G) THEN + WRITE(NOUT,*)'DIR_TRANS:SEC. DIM. OF PGPUV INCONSISTENT ',IUBOUND(2),IF_UV_G + CALL ABORT_TRANS('DIR_TRANS:SEC. DIMENSION OF PGPUV INCONSISTENT ') + ENDIF + IF(IUBOUND(3) < 2) THEN + WRITE(NOUT,*)'DIR_TRANS:THIRD DIM. OF PGPUV TOO SMALL ',IUBOUND(3),2 + CALL ABORT_TRANS('DIR_TRANS:THIRD DIMENSION OF PGPUV TOO SMALL ') + ENDIF + IF(IUBOUND(4) < NGPBLKS) THEN + WRITE(NOUT,*)'DIR_TRANS:FOURTH DIM. OF PGPUV TOO SMALL ',IUBOUND(4),NGPBLKS + CALL ABORT_TRANS('DIR_TRANS:FOURTH DIMENSION OF PGPUV TOO SMALL ') + ENDIF +ENDIF + +IF(PRESENT(PGP2)) THEN + IF(.NOT.PRESENT(PSPSC2)) THEN + CALL ABORT_TRANS('DIR_TRANS:PSPSC2 HAS TO BE PRESENT WHEN PGP2 IS') + ENDIF +ENDIF +IF(IF_SC2_G > 0) THEN + IF(PRESENT(PGP2)) THEN + IUBOUND(1:3)=UBOUND(PGP2) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'DIR_TRANS:FIRST DIM. OF PGP2 TOO SMALL ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('DIR_TRANS:FIRST DIMENSION OF PGP2 TOO SMALL ') + ENDIF + IF(IUBOUND(2) /= IF_SC2_G) THEN + WRITE(NOUT,*)'DIR_TRANS:SEC. DIM. OF PGP2 INCONSISTENT ',IUBOUND(2),IF_SC2_G + CALL ABORT_TRANS('DIR_TRANS:SEC. DIMENSION OF PGP2 INCONSISTENT') + ENDIF + IF(IUBOUND(3) < NGPBLKS) THEN + WRITE(NOUT,*)'DIR_TRANS:THIRD DIM. OF PGP2 TOO SMALL ',IUBOUND(3),NGPBLKS + CALL ABORT_TRANS('DIR_TRANS:THIRD DIMENSION OF PGP2 TOO SMALL ') + ENDIF + ELSE + CALL ABORT_TRANS('DIR_TRANS:PGP2 MISSING') + ENDIF +ENDIF + +IF(PRESENT(PGP3A)) THEN + IF(.NOT.PRESENT(PSPSC3A)) THEN + CALL ABORT_TRANS('DIR_TRANS:PSPSC3A HAS TO BE PRESENT WHEN PGP3A IS') + ENDIF +ENDIF +IF(IF_SC3A_G > 0) THEN + IF(PRESENT(PGP3A)) THEN + IUBOUND=UBOUND(PGP3A) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'DIR_TRANS:FIRST DIM. OF PGP3A TOO SMALL ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('DIR_TRANS:FIRST DIMENSION OF PGP3A TOO SMALL ') + ENDIF + IF(IUBOUND(2) /= IF_SC3A_G) THEN + WRITE(NOUT,*)'DIR_TRANS:SEC DIM. OF PGP3A INCONSISTENT ',IUBOUND(2),IF_SC3A_G + CALL ABORT_TRANS('DIR_TRANS:SEC. DIMENSION OF PGP3A INCONSISTENT ') + ENDIF + IF(IUBOUND(3) /= UBOUND(PSPSC3A,3) ) THEN + WRITE(NOUT,*)'DIR_TRANS:THIRD DIM. OF PGP3A INCONSISTENT ',& + & IUBOUND(3),UBOUND(PSPSC3A,3) + CALL ABORT_TRANS('DIR_TRANS:THIRD DIMENSION OF PGP3A INCONSISTENT ') + ENDIF + IF(IUBOUND(4) < NGPBLKS) THEN + WRITE(NOUT,*)'DIR_TRANS:FOURTH DIM. OF PGP3A TOO SMALL ',IUBOUND(4),NGPBLKS + CALL ABORT_TRANS('DIR_TRANS:FOURTH DIMENSION OF PGP3A TOO SMALL ') + ENDIF + ELSE + CALL ABORT_TRANS('DIR_TRANS:PGP3A MISSING') + ENDIF +ENDIF + +IF(PRESENT(PGP3B)) THEN + IF(.NOT.PRESENT(PSPSC3B)) THEN + CALL ABORT_TRANS('DIR_TRANS:PSPSC3B HAS TO BE PRESENT WHEN PGP3B IS') + ENDIF +ENDIF +IF(IF_SC3B_G > 0) THEN + IF(PRESENT(PGP3B)) THEN + IUBOUND=UBOUND(PGP3B) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'DIR_TRANS:FIRST DIM. OF PGP3B TOO SMALL ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('DIR_TRANS:FIRST DIMENSION OF PGP3B TOO SMALL ') + ENDIF + IF(IUBOUND(2) /= IF_SC3B_G) THEN + WRITE(NOUT,*)'DIR_TRANS:SEC DIM. OF PGP3B INCONSISTENT ',IUBOUND(2),IF_SC3B_G + CALL ABORT_TRANS('DIR_TRANS:SEC. DIMENSION OF PGP3B INCONSISTENT ') + ENDIF + IF(IUBOUND(3) /= UBOUND(PSPSC3B,3) ) THEN + WRITE(NOUT,*)'DIR_TRANS:THIRD DIM. OF PGP3B INCONSISTENT ',& + & IUBOUND(3),UBOUND(PSPSC3B,3) + CALL ABORT_TRANS('DIR_TRANS:THIRD DIMENSION OF PGP3B INCONSISTENT ') + ENDIF + IF(IUBOUND(4) < NGPBLKS) THEN + WRITE(NOUT,*)'DIR_TRANS:FOURTH DIM. OF PGP3B TOO SMALL ',IUBOUND(4),NGPBLKS + CALL ABORT_TRANS('DIR_TRANS:FOURTH DIMENSION OF PGP3B TOO SMALL ') + ENDIF + ELSE + CALL ABORT_TRANS('DIR_TRANS:PGP3B MISSING') + ENDIF +ENDIF +CALL GSTATS(1808,1) + +! ------------------------------------------------------------------ + +CALL DIR_TRANS_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,IF_UV,IF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,& + & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(410,1) + IF (LHOOK) CALL DR_HOOK('DIR_TRANS',1,ZHOOK_HANDLE) + +! ------------------------------------------------------------------ +!endif INTERFACE + +END SUBROUTINE DIR_TRANS diff --git a/src/trans/gpu/external/dir_transad.F90 b/src/trans/gpu/external/dir_transad.F90 new file mode 100755 index 00000000..ee2f0723 --- /dev/null +++ b/src/trans/gpu/external/dir_transad.F90 @@ -0,0 +1,145 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE DIR_TRANSAD(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& +& KPROMA,KVSETUV,KVSETSC,KRESOL,KVSETSC3A,KVSETSC3B,KVSETSC2,& +& PGP,PGPUV,PGP3A,PGP3B,PGP2) + + +!**** *DIR_TRANSAD* - Direct spectral transform - adjoint. + +! Purpose. +! -------- +! Interface routine for the direct spectral transform - adjoint + +!** Interface. +! ---------- +! CALL DIR_TRANSAD(...) + +! Explicit arguments : All arguments except from PGP are optional. +! -------------------- +! PSPVOR(:,:) - spectral vorticity (output) +! PSPDIV(:,:) - spectral divergence (output) +! PSPSCALAR(:,:) - spectral scalarvalued fields (output) +! PSPSC3A(:,:,:) - alternative to use of PSPSCALAR, see PGP3A below (input) +! PSPSC3B(:,:,:) - alternative to use of PSPSCALAR, see PGP3B below (input) +! PSPSC2(:,:) - alternative to use of PSPSCALAR, see PGP2 below (input) +! KPROMA - required blocking factor for gridpoint output +! KVSETUV(:) - indicating which 'b-set' in spectral space owns a +! vor/div field. Equivalant to NBSETLEV in the IFS. +! The length of KVSETUV should be the GLOBAL number +! of u/v fields which is the dimension of u and v releated +! fields in grid-point space. +! KVESETSC(:) - indicating which 'b-set' in spectral space owns a +! scalar field. As for KVSETUV this argument is required +! if the total number of processors is greater than +! the number of processors used for distribution in +! spectral wave space. +! KVSETSC3A(:) - as KVESETSC for PSPSC3A (distribution on first dimension) +! KVSETSC3B(:) - as KVESETSC for PSPSC3B (distribution on first dimension) +! KVSETSC2(:) - as KVESETSC for PSPSC2 (distribution on first dimension) +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PGP(:,:,:) - gridpoint fields (input) +! PGP need to dimensioned (NPROMA,IF_GP,NGPBLKS) where +! NPROMA is the blocking factor, IF_GP the total number +! of output fields and NGPBLKS the number of NPROMA blocks. +! The ordering of the output fields is as follows (all +! parts are optional depending on the input switches): +! +! u : IF_UV_G fields (if psvor present) +! v : IF_UV_G fields (if psvor present) +! scalar fields : IF_SCALARS_G fields (if pspscalar present) +! +! Here IF_UV_G is the GLOBAL number of u/v fields as given by the length +! of KVSETUV (or by PSPVOR if no split in spectral 'b-set' direction +! IF_SCALARS_G is the GLOBAL number of scalar fields as giben by the +! length of KVESETSC (or by number of fields in PSPSCALAR if no spectral +! 'b-set' split +! +! As an alternative to using PGP you can also use a combination of the +! following arrays. The reason for introducing these alternative ways +! of calling DIR_TRANS is to avoid uneccessary copies where your data +! structures don't fit in to the 'PSPVOR,PSPDIV, PSPSCALAR, PGP' layout. +! The use of any of these precludes the use of PGP and vice versa. + +! PGPUV(:,:,:,:) - the 'u-v' related grid-point variables in the order +! described for PGP. The second dimension of PGPUV should +! be the same as the "global" first dimension of +! PSPVOR,PSPDIV (in the IFS this is the number of levels) +! PGPUV need to be dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (u,v) +! PGP3A(:,:,:,:) - grid-point array directly connected with PSPSC3A +! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC3A ) +! PGP3B(:,:,:,:) - grid-point array directly connected with PSPSC3B +! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC3B) +! PGP2(:,:,:) - grid-point array directly connected with PSPSC2 +! dimensioned(NPROMA,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC2 ) +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- DIR_TRANS_CTLAD - control routine +! + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPVOR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPDIV(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSCALAR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3A(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3B(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC2(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL + +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP(:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGPUV(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) + +CALL ABORT_TRANS("DIR_TRANSAD: NOT SUPPORTED FOR GPU") + +END SUBROUTINE DIR_TRANSAD + + diff --git a/src/trans/gpu/external/dist_grid.F90 b/src/trans/gpu/external/dist_grid.F90 new file mode 100755 index 00000000..05f34ee5 --- /dev/null +++ b/src/trans/gpu/external/dist_grid.F90 @@ -0,0 +1,147 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE DIST_GRID(PGPG,KPROMA,KFDISTG,KFROM,KRESOL,PGP,KSORT) + +!**** *DIST_GRID* - Distribute global gridpoint array among processors + +! Purpose. +! -------- +! Interface routine for distributing gridpoint array + +!** Interface. +! ---------- +! CALL DIST_GRID(...) + +! Explicit arguments : +! -------------------- +! PGPG(:,:) - Global spectral array +! KFDISTG - Global number of fields to be distributed +! KPROMA - required blocking factor for gridpoint input +! KFROM(:) - Processor resposible for distributing each field +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PGP(:,:) - Local spectral array +! KSORT (:) - Re-order fields on output +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- DIST_GRID_CTL - control routine + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 +! P.Marguinaud : 10-10-14 Add KSORT + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR, NOUT +!USE TPM_DIM +USE TPM_DISTR ,ONLY : D, MYPROC, NPROC + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE DIST_GRID_CTL_MOD ,ONLY : DIST_GRID_CTL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGPG(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) , INTENT(IN) :: KFDISTG +INTEGER(KIND=JPIM) , INTENT(IN) :: KFROM(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +REAL(KIND=JPRB) , INTENT(OUT) :: PGP(:,:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KSORT (:) + +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IFSEND,J,IUBOUND(3),IPROMA,IGPBLKS +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('DIST_GRID',0,ZHOOK_HANDLE) +! Set current resolution +CALL SET_RESOL(KRESOL) + +IPROMA = D%NGPTOT +IF(PRESENT(KPROMA)) THEN + IPROMA = KPROMA +ENDIF +IGPBLKS = (D%NGPTOT-1)/IPROMA+1 + +IF(UBOUND(KFROM,1) < KFDISTG) THEN + CALL ABORT_TRANS('DIST_GRID: KFROM TOO SHORT!') +ENDIF +IFSEND = 0 +DO J=1,KFDISTG + IF(KFROM(J) < 1 .OR. KFROM(J) > NPROC) THEN + WRITE(NERR,*) 'DIST_GRID:ILLEGAL KFROM VALUE',KFROM(J),J + CALL ABORT_TRANS('DIST_GRID:ILLEGAL KFROM VALUE') + ENDIF + IF(KFROM(J) == MYPROC) IFSEND = IFSEND+1 +ENDDO + +IUBOUND=UBOUND(PGP) +IF(IUBOUND(1) < IPROMA) THEN + WRITE(NOUT,*)'DIST_GRID:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),IPROMA + CALL ABORT_TRANS('DIST_GRID:FIRST DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(2) < KFDISTG) THEN + WRITE(NOUT,*)'DIST_GRID:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),KFDISTG + CALL ABORT_TRANS('DIST_GRID:SECOND DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(3) < IGPBLKS) THEN + WRITE(NOUT,*)'DIST_GRID:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),IGPBLKS + CALL ABORT_TRANS('DIST_GRID:THIRD DIMENSION OF PGP TOO SMALL ') +ENDIF + +IF(IFSEND > 0) THEN + IF(.NOT.PRESENT(PGPG)) THEN + CALL ABORT_TRANS('DIST_GRID:PGPG MISSING') + ENDIF + IF(UBOUND(PGPG,1) < D%NGPTOTG) THEN + CALL ABORT_TRANS('DIST_GRID:FIRST DIMENSION OF PGPG TOO SMALL') + ENDIF + IF(UBOUND(PGPG,2) < IFSEND) THEN + CALL ABORT_TRANS('DIST_GRID:FIRST DIMENSION OF PGPG TOO SMALL') + ENDIF +ENDIF + +IF (PRESENT (KSORT)) THEN + IF (UBOUND (KSORT, 1) /= UBOUND (PGP, 2)) THEN + CALL ABORT_TRANS('DIST_GRID: DIMENSION MISMATCH KSORT, PGP') + ENDIF +ENDIF + +CALL DIST_GRID_CTL(PGPG,KFDISTG,IPROMA,KFROM,PGP,KSORT) + +IF (LHOOK) CALL DR_HOOK('DIST_GRID',1,ZHOOK_HANDLE) +!endif INTERFACE + +! ------------------------------------------------------------------ + +END SUBROUTINE DIST_GRID + diff --git a/src/trans/gpu/external/dist_grid_32.F90 b/src/trans/gpu/external/dist_grid_32.F90 new file mode 100755 index 00000000..ab3d3f00 --- /dev/null +++ b/src/trans/gpu/external/dist_grid_32.F90 @@ -0,0 +1,140 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE DIST_GRID_32(PGPG,KPROMA,KFDISTG,KFROM,KRESOL,PGP) + +!**** *DIST_GRID_32* - Distribute global gridpoint array among processors + +! Purpose. +! -------- +! Interface routine for distributing gridpoint array + +!** Interface. +! ---------- +! CALL DIST_GRID_32(...) + +! Explicit arguments : +! -------------------- +! PGPG(:,:) - Global spectral array +! KFDISTG - Global number of fields to be distributed +! KPROMA - required blocking factor for gridpoint input +! KFROM(:) - Processor resposible for distributing each field +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PGP(:,:) - Local spectral array +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- DIST_GRID_32_CTL - control routine + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN +USE TPM_DIM +USE TPM_DISTR + +USE SET_RESOL_MOD +USE DIST_GRID_32_CTL_MOD +USE ABORT_TRANS_MOD +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRM) ,OPTIONAL, INTENT(IN) :: PGPG(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) , INTENT(IN) :: KFDISTG +INTEGER(KIND=JPIM) , INTENT(IN) :: KFROM(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +REAL(KIND=JPRM) , INTENT(OUT) :: PGP(:,:,:) + +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IFSEND,J,IUBOUND(3),IPROMA,IGPBLKS +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('DIST_GRID_32',0,ZHOOK_HANDLE) +! Set current resolution +CALL SET_RESOL(KRESOL) + +IPROMA = D%NGPTOT +IF(PRESENT(KPROMA)) THEN + IPROMA = KPROMA +ENDIF +IGPBLKS = (D%NGPTOT-1)/IPROMA+1 + +IF(UBOUND(KFROM,1) < KFDISTG) THEN + CALL ABORT_TRANS('DIST_GRID_32: KFROM TOO SHORT!') +ENDIF + +IFSEND = 0 +DO J=1,KFDISTG + IF(KFROM(J) < 1 .OR. KFROM(J) > NPROC) THEN + WRITE(NERR,*) 'DIST_GRID_32:ILLEGAL KFROM VALUE',KFROM(J),J + CALL ABORT_TRANS('DIST_GRID_32:ILLEGAL KFROM VALUE') + ENDIF + IF(KFROM(J) == MYPROC) IFSEND = IFSEND+1 +ENDDO + +IUBOUND=UBOUND(PGP) +IF(IUBOUND(1) < IPROMA) THEN + WRITE(NOUT,*)'DIST_GRID_32:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),IPROMA + CALL ABORT_TRANS('DIST_GRID_32:FIRST DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(2) < KFDISTG) THEN + WRITE(NOUT,*)'DIST_GRID_32:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),KFDISTG + CALL ABORT_TRANS('DIST_GRID_32:SECOND DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(3) < IGPBLKS) THEN + WRITE(NOUT,*)'DIST_GRID_32:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),IGPBLKS + CALL ABORT_TRANS('DIST_GRID_32:THIRD DIMENSION OF PGP TOO SMALL ') +ENDIF + +IF(IFSEND > 0) THEN + IF(.NOT.PRESENT(PGPG)) THEN + CALL ABORT_TRANS('DIST_GRID_32:PGPG MISSING') + ENDIF + IF(UBOUND(PGPG,1) < D%NGPTOTG) THEN + CALL ABORT_TRANS('DIST_GRID_32:FIRST DIMENSION OF PGPG TOO SMALL') + ENDIF + IF(UBOUND(PGPG,2) < IFSEND) THEN + CALL ABORT_TRANS('DIST_GRID_32:FIRST DIMENSION OF PGPG TOO SMALL') + ENDIF +ENDIF + + +CALL DIST_GRID_32_CTL(PGPG,KFDISTG,IPROMA,KFROM,PGP) + +IF (LHOOK) CALL DR_HOOK('DIST_GRID_32',1,ZHOOK_HANDLE) +!endif INTERFACE + +! ------------------------------------------------------------------ + +END SUBROUTINE DIST_GRID_32 + diff --git a/src/trans/gpu/external/dist_spec.F90 b/src/trans/gpu/external/dist_spec.F90 new file mode 100755 index 00000000..084c7474 --- /dev/null +++ b/src/trans/gpu/external/dist_spec.F90 @@ -0,0 +1,201 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE DIST_SPEC(PSPECG,KFDISTG,KFROM,KVSET,KRESOL,PSPEC,& + & LDIM1_IS_FLD,KSMAX,KSORT) + +!**** *DIST_SPEC* - Distribute global spectral array among processors + +! Purpose. +! -------- +! Interface routine for distributing spectral array + +!** Interface. +! ---------- +! CALL DIST__SPEC(...) + +! Explicit arguments : +! -------------------- +! PSPECG(:,:) - Global spectral array +! KFDISTG - Global number of fields to be distributed +! KFROM(:) - Processor resposible for distributing each field +! KVSET(:) - "B-Set" for each field +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PSPEC(:,:) - Local spectral array +! KSORT (:) - Re-order fields on output +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- DIST_SPEC_CONTROL - control routine + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 +! P.Marguinaud : 10-10-14 Add KSORT + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D, NPRTRV, NPRTRW, MYSETV, MYSETW, MYPROC, NPROC + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE DIST_SPEC_CONTROL_MOD ,ONLY : DIST_SPEC_CONTROL +USE SUWAVEDI_MOD ,ONLY : SUWAVEDI +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPECG(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFDISTG +INTEGER(KIND=JPIM) , INTENT(IN) :: KFROM(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSET(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPEC(:,:) +LOGICAL ,OPTIONAL, INTENT(IN) :: LDIM1_IS_FLD +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KSMAX +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KSORT (:) + +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IVSET(KFDISTG) +INTEGER(KIND=JPIM) :: IFSEND,IFRECV,J,IFLD,ICOEFF +INTEGER(KIND=JPIM) :: ISMAX, ISPEC2, ISPEC2_G +INTEGER(KIND=JPIM) :: IPOSSP(NPRTRW+1) +INTEGER(KIND=JPIM),ALLOCATABLE :: IDIM0G(:) +LOGICAL :: LLDIM1_IS_FLD +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('DIST_SPEC',0,ZHOOK_HANDLE) +! Set current resolution +CALL SET_RESOL(KRESOL) + +LLDIM1_IS_FLD = .TRUE. +IF(PRESENT(LDIM1_IS_FLD)) LLDIM1_IS_FLD = LDIM1_IS_FLD + +IF(LLDIM1_IS_FLD) THEN + IFLD = 1 + ICOEFF = 2 +ELSE + IFLD = 2 + ICOEFF = 1 +ENDIF +IF(UBOUND(KFROM,1) < KFDISTG) THEN + CALL ABORT_TRANS('DIST_SPEC: KFROM TOO SHORT!') +ENDIF + +ISMAX = R%NSMAX +IF(PRESENT(KSMAX)) ISMAX = KSMAX +ALLOCATE(IDIM0G(0:ISMAX)) +IF(ISMAX /= R%NSMAX) THEN + CALL SUWAVEDI(ISMAX,ISMAX,NPRTRW,MYSETW,KPOSSP=IPOSSP,KSPEC2=ISPEC2,& + & KDIM0G=IDIM0G) + ISPEC2_G = (ISMAX+1)*(ISMAX+2) +ELSE + ISPEC2 = D%NSPEC2 + ISPEC2_G = R%NSPEC2_G + IPOSSP(:) = D%NPOSSP(:) + IDIM0G(:) = D%NDIM0G(:) +ENDIF +IFSEND = 0 +IFRECV = 0 + +DO J=1,KFDISTG + IF(KFROM(J) < 1 .OR. KFROM(J) > NPROC) THEN + WRITE(NERR,*) 'DIST_SPEC:ILLEGAL KFROM VALUE',KFROM(J),J + CALL ABORT_TRANS('DIST_SPEC:ILLEGAL KFROM VALUE') + ENDIF + IF(KFROM(J) == MYPROC) IFSEND = IFSEND+1 +ENDDO + +IF(IFSEND > 0) THEN + IF(.NOT.PRESENT(PSPECG)) THEN + CALL ABORT_TRANS('DIST_SPEC:PSPECG MISSING') + ENDIF + IF(UBOUND(PSPECG,IFLD) < IFSEND) THEN + WRITE(NERR,*) 'DIST_SPEC: ',IFLD, UBOUND(PSPECG,IFLD), IFSEND + CALL ABORT_TRANS('DIST_SPEC:FIELD DIMENSION OF PSPECG TOO SMALL') + ENDIF + IF(UBOUND(PSPECG,ICOEFF) < ISPEC2_G) THEN + WRITE(NERR,*) 'DIST_SPEC: ',ICOEFF, UBOUND(PSPECG,ICOEFF), ISPEC2_G + CALL ABORT_TRANS('DIST_SPEC:COEFF DIMENSION OF PSPECG TOO SMALL') + ENDIF +ENDIF + +IF(PRESENT(KVSET)) THEN + IF(UBOUND(KVSET,1) < KFDISTG) THEN + CALL ABORT_TRANS('DIST_SPEC: KVSET TOO SHORT!') + ENDIF + DO J=1,KFDISTG + IF(KVSET(J) > NPRTRV .OR. KVSET(J) < 1) THEN + WRITE(NERR,*) 'DIST_SPEC:KVSET(J) > NPRTRV ',J,KVSET(J),NPRTRV + CALL ABORT_TRANS('DIST_SPEC:KVSET CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSET(J) == MYSETV) THEN + IFRECV = IFRECV+1 + ENDIF + ENDDO + IVSET(:) = KVSET(1:KFDISTG) +ELSE + IFRECV = KFDISTG + IVSET(:) = MYSETV +ENDIF + +IF(IFRECV > 0 ) THEN + IF(.NOT.PRESENT(PSPEC)) THEN + CALL ABORT_TRANS('DIST_SPEC: FIELDS TO RECEIVE AND PSPEC NOT PRESENT') + ENDIF + IF(UBOUND(PSPEC,IFLD) < IFRECV) THEN + CALL ABORT_TRANS('DIST_SPEC: FIELD DIMENSION OF PSPEC TOO SMALL') + ENDIF + IF(UBOUND(PSPEC,ICOEFF) < ISPEC2 ) THEN + CALL ABORT_TRANS('DIST_SPEC: COEFF DIMENSION OF PSPEC TOO SMALL') + ENDIF +ENDIF + +IF (PRESENT (KSORT)) THEN + IF (.NOT. PRESENT (PSPEC)) THEN + CALL ABORT_TRANS('DIST_SPEC: KSORT REQUIRES PSPEC') + ENDIF + IF (UBOUND (KSORT, 1) /= UBOUND (PSPEC, IFLD)) THEN + CALL ABORT_TRANS('DIST_SPEC: DIMENSION MISMATCH KSORT, PSPEC') + ENDIF +ENDIF + +CALL DIST_SPEC_CONTROL(PSPECG,KFDISTG,KFROM,IVSET,PSPEC,LLDIM1_IS_FLD,& + & ISMAX,ISPEC2,ISPEC2_G,IPOSSP,IDIM0G,KSORT) + +DEALLOCATE(IDIM0G) + +IF (LHOOK) CALL DR_HOOK('DIST_SPEC',1,ZHOOK_HANDLE) +!endif INTERFACE + +! ------------------------------------------------------------------ + +END SUBROUTINE DIST_SPEC + diff --git a/src/trans/gpu/external/gath_grid.F90 b/src/trans/gpu/external/gath_grid.F90 new file mode 100755 index 00000000..8cfcc40e --- /dev/null +++ b/src/trans/gpu/external/gath_grid.F90 @@ -0,0 +1,140 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE GATH_GRID(PGPG,KPROMA,KFGATHG,KTO,KRESOL,PGP) + +!**** *GATH_GRID* - Gather global gridpoint array from processors + +! Purpose. +! -------- +! Interface routine for gathering gripoint array + +!** Interface. +! ---------- +! CALL GATH_GRID(...) + +! Explicit arguments : +! -------------------- +! PGPG(:,:) - Global gridpoint array +! KFGATHG - Global number of fields to be gathered +! KPROMA - blocking factor for gridpoint input +! KTO(:) - Processor responsible for gathering each field +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PGP(:,:,:) - Local spectral array +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- GATH_GRID_CTL - control routine + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR, NOUT +!USE TPM_DIM +USE TPM_DISTR ,ONLY : D, MYPROC, NPROC + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE GATH_GRID_CTL_MOD ,ONLY : GATH_GRID_CTL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PGPG(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) , INTENT(IN) :: KFGATHG +INTEGER(KIND=JPIM) , INTENT(IN) :: KTO(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +REAL(KIND=JPRB) , INTENT(IN) :: PGP(:,:,:) + +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IFRECV,J,IUBOUND(3),IPROMA,IGPBLKS +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('GATH_GRID',0,ZHOOK_HANDLE) +! Set current resolution +CALL SET_RESOL(KRESOL) + +IPROMA = D%NGPTOT +IF(PRESENT(KPROMA)) THEN + IPROMA = KPROMA +ENDIF +IGPBLKS = (D%NGPTOT-1)/IPROMA+1 + + +IF(UBOUND(KTO,1) < KFGATHG) THEN + CALL ABORT_TRANS('GATH_GRID: KTO TOO SHORT!') +ENDIF + +IFRECV = 0 +DO J=1,KFGATHG + IF(KTO(J) < 1 .OR. KTO(J) > NPROC) THEN + WRITE(NERR,*) 'GATH_GRID:ILLEGAL KTO VALUE',KTO(J),J + CALL ABORT_TRANS('GATH_GRID:ILLEGAL KTO VALUE') + ENDIF + IF(KTO(J) == MYPROC) IFRECV = IFRECV+1 +ENDDO + +IUBOUND=UBOUND(PGP) +IF(IUBOUND(1) < IPROMA) THEN + WRITE(NOUT,*)'GATH_GRID:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),IPROMA + CALL ABORT_TRANS('GATH_GRID:FIRST DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(2) < KFGATHG) THEN + WRITE(NOUT,*)'GATH_GRID:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),KFGATHG + CALL ABORT_TRANS('GATH_GRID:SECOND DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(3) < IGPBLKS) THEN + WRITE(NOUT,*)'GATH_GRID:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),IGPBLKS + CALL ABORT_TRANS('GATH_GRID:THIRD DIMENSION OF PGP TOO SMALL ') +ENDIF + +IF(IFRECV > 0) THEN + IF(.NOT.PRESENT(PGPG)) THEN + CALL ABORT_TRANS('GATH_GRID:PGPG MISSING') + ENDIF + IF(UBOUND(PGPG,1) < D%NGPTOTG) THEN + CALL ABORT_TRANS('GATH_GRID:FIRST DIMENSION OF PGPG TOO SMALL') + ENDIF + IF(UBOUND(PGPG,2) < IFRECV) THEN + CALL ABORT_TRANS('GATH_GRID:SECOND DIMENSION OF PGPG TOO SMALL') + ENDIF +ENDIF + +CALL GATH_GRID_CTL(PGPG,KFGATHG,IPROMA,KTO,PGP) + +IF (LHOOK) CALL DR_HOOK('GATH_GRID',1,ZHOOK_HANDLE) +!endif INTERFACE + +! ------------------------------------------------------------------ + +END SUBROUTINE GATH_GRID + diff --git a/src/trans/gpu/external/gath_grid_32.F90 b/src/trans/gpu/external/gath_grid_32.F90 new file mode 100755 index 00000000..052552f3 --- /dev/null +++ b/src/trans/gpu/external/gath_grid_32.F90 @@ -0,0 +1,140 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE GATH_GRID_32(PGPG,KPROMA,KFGATHG,KTO,KRESOL,PGP) + +!**** *GATH_GRID_32* - Gather global gridpoint array from processors + +! Purpose. +! -------- +! Interface routine for gathering gripoint array + +!** Interface. +! ---------- +! CALL GATH_GRID_32(...) + +! Explicit arguments : +! -------------------- +! PGPG(:,:) - Global gridpoint array +! KFGATHG - Global number of fields to be gathered +! KPROMA - blocking factor for gridpoint input +! KTO(:) - Processor responsible for gathering each field +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PGP(:,:,:) - Local spectral array +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- GATH_GRID_32_CTL - control routine + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB, JPRM + +!ifndef INTERFACE + +USE TPM_GEN +USE TPM_DIM +USE TPM_DISTR + +USE SET_RESOL_MOD +USE GATH_GRID_32_CTL_MOD +USE ABORT_TRANS_MOD +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRM) ,OPTIONAL, INTENT(OUT) :: PGPG(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) , INTENT(IN) :: KFGATHG +INTEGER(KIND=JPIM) , INTENT(IN) :: KTO(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +REAL(KIND=JPRM) , INTENT(IN) :: PGP(:,:,:) + +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IFRECV,J,IUBOUND(3),IPROMA,IGPBLKS +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('GATH_GRID_32',0,ZHOOK_HANDLE) +! Set current resolution +CALL SET_RESOL(KRESOL) + +IPROMA = D%NGPTOT +IF(PRESENT(KPROMA)) THEN + IPROMA = KPROMA +ENDIF +IGPBLKS = (D%NGPTOT-1)/IPROMA+1 + + +IF(UBOUND(KTO,1) < KFGATHG) THEN + CALL ABORT_TRANS('GATH_GRID_32: KTO TOO SHORT!') +ENDIF + +IFRECV = 0 +DO J=1,KFGATHG + IF(KTO(J) < 1 .OR. KTO(J) > NPROC) THEN + WRITE(NERR,*) 'GATH_GRID_32:ILLEGAL KTO VALUE',KTO(J),J + CALL ABORT_TRANS('GATH_GRID_32:ILLEGAL KTO VALUE') + ENDIF + IF(KTO(J) == MYPROC) IFRECV = IFRECV+1 +ENDDO + +IUBOUND=UBOUND(PGP) +IF(IUBOUND(1) < IPROMA) THEN + WRITE(NOUT,*)'GATH_GRID_32:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),IPROMA + CALL ABORT_TRANS('GATH_GRID_32:FIRST DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(2) < KFGATHG) THEN + WRITE(NOUT,*)'GATH_GRID_32:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),KFGATHG + CALL ABORT_TRANS('GATH_GRID_32:SECOND DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(3) < IGPBLKS) THEN + WRITE(NOUT,*)'GATH_GRID_32:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),IGPBLKS + CALL ABORT_TRANS('GATH_GRID_32:THIRD DIMENSION OF PGP TOO SMALL ') +ENDIF + +IF(IFRECV > 0) THEN + IF(.NOT.PRESENT(PGPG)) THEN + CALL ABORT_TRANS('GATH_GRID_32:PGPG MISSING') + ENDIF + IF(UBOUND(PGPG,1) < D%NGPTOTG) THEN + CALL ABORT_TRANS('GATH_GRID_32:FIRST DIMENSION OF PGPG TOO SMALL') + ENDIF + IF(UBOUND(PGPG,2) < IFRECV) THEN + CALL ABORT_TRANS('GATH_GRID_32:SECOND DIMENSION OF PGPG TOO SMALL') + ENDIF +ENDIF + +CALL GATH_GRID_32_CTL(PGPG,KFGATHG,IPROMA,KTO,PGP) + +IF (LHOOK) CALL DR_HOOK('GATH_GRID_32',1,ZHOOK_HANDLE) +!endif INTERFACE + +! ------------------------------------------------------------------ + +END SUBROUTINE GATH_GRID_32 + diff --git a/src/trans/gpu/external/gath_spec.F90 b/src/trans/gpu/external/gath_spec.F90 new file mode 100755 index 00000000..9af1f81a --- /dev/null +++ b/src/trans/gpu/external/gath_spec.F90 @@ -0,0 +1,194 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE GATH_SPEC(PSPECG,KFGATHG,KTO,KVSET,KRESOL,PSPEC,LDIM1_IS_FLD,KSMAX,LDZA0IP) + +!**** *GATH_SPEC* - Gather global spectral array from processors + +! Purpose. +! -------- +! Interface routine for gathering spectral array + +!** Interface. +! ---------- +! CALL GATH_SPEC(...) + +! Explicit arguments : +! -------------------- +! PSPECG(:,:) - Global spectral array +! KFGATHG - Global number of fields to be gathered +! KTO(:) - Processor responsible for gathering each field +! KVSET(:) - "B-Set" for each field +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PSPEC(:,:) - Local spectral array +! LDZA0IP - Set to zero imaginary part of first coefficients +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- GATH_SPEC_CONTROL - control routine + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 +! Modified 03-09-30 Y. Seity, bug correction IFSEND=0 +! Modified 13-10-10 P. Marguinaud add LDZA0IP option +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D, NPRTRV, NPRTRW, MYSETV, MYSETW, MYPROC, NPROC + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE GATH_SPEC_CONTROL_MOD ,ONLY : GATH_SPEC_CONTROL +USE SUWAVEDI_MOD ,ONLY : SUWAVEDI +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPECG(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFGATHG +INTEGER(KIND=JPIM) , INTENT(IN) :: KTO(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSET(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPEC(:,:) +LOGICAL ,OPTIONAL, INTENT(IN) :: LDIM1_IS_FLD +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KSMAX +LOGICAL ,OPTIONAL, INTENT(IN) :: LDZA0IP + +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IVSET(KFGATHG) +INTEGER(KIND=JPIM) :: IFRECV,IFSEND,J +INTEGER(KIND=JPIM) :: IFLD,ICOEFF +INTEGER(KIND=JPIM) :: ISMAX, ISPEC2, ISPEC2_G +INTEGER(KIND=JPIM) :: IPOSSP(NPRTRW+1) +INTEGER(KIND=JPIM),ALLOCATABLE :: IDIM0G(:) +LOGICAL :: LLDIM1_IS_FLD +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('GATH_SPEC',0,ZHOOK_HANDLE) +! Set current resolution +CALL SET_RESOL(KRESOL) + +LLDIM1_IS_FLD = .TRUE. +IF(PRESENT(LDIM1_IS_FLD)) LLDIM1_IS_FLD = LDIM1_IS_FLD + +IF(LLDIM1_IS_FLD) THEN + IFLD = 1 + ICOEFF = 2 +ELSE + IFLD = 2 + ICOEFF = 1 +ENDIF +IF(UBOUND(KTO,1) < KFGATHG) THEN + CALL ABORT_TRANS('GATH_SPEC: KTO TOO SHORT!') +ENDIF + +ISMAX = R%NSMAX +IF(PRESENT(KSMAX)) ISMAX = KSMAX +ALLOCATE(IDIM0G(0:ISMAX)) +IF(ISMAX /= R%NSMAX) THEN + CALL SUWAVEDI(ISMAX,ISMAX,NPRTRW,MYSETW,KPOSSP=IPOSSP,KSPEC2=ISPEC2,& + & KDIM0G=IDIM0G) + ISPEC2_G = (ISMAX+1)*(ISMAX+2) +ELSE + ISPEC2 = D%NSPEC2 + ISPEC2_G = R%NSPEC2_G + IPOSSP(:) = D%NPOSSP(:) + IDIM0G(:) = D%NDIM0G(:) +ENDIF + +IFSEND = 0 +IFRECV = 0 +DO J=1,KFGATHG + IF(KTO(J) < 1 .OR. KTO(J) > NPROC) THEN + WRITE(NERR,*) 'GATH_SPEC:ILLEGAL KTO VALUE',KTO(J),J + CALL ABORT_TRANS('GATH_SPEC:ILLEGAL KTO VALUE') + ENDIF + IF(KTO(J) == MYPROC) IFRECV = IFRECV+1 +ENDDO + +IF(IFRECV > 0) THEN + IF(.NOT.PRESENT(PSPECG)) THEN + CALL ABORT_TRANS('GATH_SPEC:PSPECG MISSING') + ENDIF + IF(UBOUND(PSPECG,IFLD) < IFRECV) THEN + WRITE(NERR,*) 'GATH_SPEC: ',IFLD, UBOUND(PSPECG,IFLD), IFRECV + CALL ABORT_TRANS('GATH_SPEC:FIELD DIMENSION OF PSPECG TOO SMALL') + ENDIF + IF(UBOUND(PSPECG,ICOEFF) < ISPEC2_G) THEN + WRITE(NERR,*) 'GATH_SPEC: ',ICOEFF, UBOUND(PSPECG,ICOEFF), ISPEC2_G + CALL ABORT_TRANS('GATH_SPEC:COEFF DIMENSION OF PSPECG TOO SMALL') + ENDIF +ENDIF + +IF(PRESENT(KVSET)) THEN + IF(UBOUND(KVSET,1) < KFGATHG) THEN + CALL ABORT_TRANS('GATH_SPEC: KVSET TOO SHORT!') + ENDIF + DO J=1,KFGATHG + IF(KVSET(J) > NPRTRV .OR. KVSET(J) < 1) THEN + WRITE(NERR,*) 'GATH_SPEC:KVSET(J) > NPRTRV ',J,KVSET(J),NPRTRV + CALL ABORT_TRANS('GATH_SPEC:KVSET CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSET(J) == MYSETV) THEN + IFSEND = IFSEND+1 + ENDIF + ENDDO + IVSET(:) = KVSET(1:KFGATHG) +ELSEIF(NPRTRV > 1) THEN + WRITE(NERR,*) 'GATH_SPEC:KVSET MISSING, NPRTRV ',NPRTRV + CALL ABORT_TRANS('GATH_SPEC:KVSET MISSING, NPRTRV > 1') +ELSE + IFSEND = KFGATHG + IVSET(:) = 1 +ENDIF + +IF(IFSEND > 0 ) THEN + IF(.NOT.PRESENT(PSPEC)) THEN + CALL ABORT_TRANS('GATH_SPEC: FIELDS TO RECIEVE AND PSPEC NOT PRESENT') + ENDIF + IF(UBOUND(PSPEC,IFLD) < IFSEND) THEN + CALL ABORT_TRANS('GATH_SPEC: FIELD DIMENSION OF PSPEC TOO SMALL') + ENDIF + IF(UBOUND(PSPEC,ICOEFF) < ISPEC2 ) THEN + CALL ABORT_TRANS('GATH_SPEC: COEFF DIMENSION OF PSPEC TOO SMALL') + ENDIF +ENDIF + +CALL GATH_SPEC_CONTROL(PSPECG,KFGATHG,KTO,IVSET,PSPEC,LLDIM1_IS_FLD,& + & ISMAX,ISPEC2,ISPEC2_G,IPOSSP,IDIM0G,LDZA0IP) +DEALLOCATE(IDIM0G) + +IF (LHOOK) CALL DR_HOOK('GATH_SPEC',1,ZHOOK_HANDLE) +!endif INTERFACE + +! ------------------------------------------------------------------ + +END SUBROUTINE GATH_SPEC + diff --git a/src/trans/gpu/external/get_current.F90 b/src/trans/gpu/external/get_current.F90 new file mode 100755 index 00000000..802701ad --- /dev/null +++ b/src/trans/gpu/external/get_current.F90 @@ -0,0 +1,67 @@ +! (C) Copyright 2012- Meteo-France. +! (C) Copyright 2012- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE GET_CURRENT(KRESOL,LDLAM) + +!**** *GET_CURRENT* - Extract current information from the transform package + +! Purpose. +! -------- +! Interface routine for extracting current information from the T.P. + +!** Interface. +! ---------- +! CALL GET_CURRENT(...) + +! Explicit arguments : (all optional) +! -------------------- +! KRESOL - Current resolution +! LDLAM - .T. if the corresponding resolution is LAM, .F. if it is global + +! Method. +! ------- + +! Externals. None +! ---------- + +! Author. +! ------- +! Ryad El Khatib *Meteo-France* + +! Modifications. +! -------------- +! Original : 24-Aug-2012 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM + +!ifndef INTERFACE + +USE TPM_GEN +USE TPM_GEOMETRY + +!endif INTERFACE + +IMPLICIT NONE + +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT) :: KRESOL +LOGICAL ,OPTIONAL,INTENT(OUT) :: LDLAM + +!ifndef INTERFACE + +! Get current resolution +IF (PRESENT(KRESOL)) KRESOL= NCUR_RESOL +IF (PRESENT(LDLAM)) LDLAM = G%LAM + + +!endif INTERFACE + +END SUBROUTINE GET_CURRENT diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 new file mode 100755 index 00000000..7b8e5e00 --- /dev/null +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -0,0 +1,482 @@ +! (C) Copyright 2008- ECMWF. +! (C) Copyright 2008- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) + + +!**** *GPNORM_TRANS* - calculate grid-point norms + +! Purpose. +! -------- +! calculate grid-point norms using a 2 stage (NPRTRV,NPRTRW) communication rather +! than an approach using a more expensive global gather collective communication + +!** Interface. +! ---------- +! CALL GPNORM_TRANS(...) + +! Explicit arguments : +! -------------------- +! PGP(:,:,:) - gridpoint fields (input) +! PGP is dimensioned (NPROMA,KFIELDS,NGPBLKS) where +! NPROMA is the blocking factor, KFIELDS the total number +! of fields and NGPBLKS the number of NPROMA blocks. +! KFIELDS - number of fields (input) +! (these do not have to be just levels) +! KPROMA - required blocking factor (input) +! PAVE - average (output) +! PMIN - minimum (input/output) +! PMAX - maximum (input/output) +! LDAVE_ONLY - T : PMIN and PMAX already contain local MIN and MAX +! KRESOL - resolution tag (optional) +! default assumes first defined resolution +! + +! Author. +! ------- +! George Mozdzynski *ECMWF* + +! Modifications. +! -------------- +! Original : 19th Sept 2008 +! R. El Khatib 07-08-2009 Optimisation directive for NEC + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB , JPRD +USE PARKIND_ECTRANS ,ONLY : JPRBT + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NOUT +USE TPM_DIM ,ONLY : R +USE TPM_TRANS ,ONLY : LGPNORM, NGPBLKS, NPROMA +USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF,D_NPTRLS, MYPROC +USE TPM_GEOMETRY ,ONLY : G,G_NLOEN +USE TPM_FIELDS ,ONLY : F,F_RW +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE SET2PE_MOD ,ONLY : SET2PE +USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, JP_BLOCKING_STANDARD +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE TRGTOL_MOD +USE TPM_TRANS, ONLY:GROWING_ALLOCATION +USE BUFFERED_ALLOCATOR_MOD + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,INTENT(IN) :: PGP(:,:,:) +REAL(KIND=JPRB) ,INTENT(OUT) :: PAVE(:) +REAL(KIND=JPRB) ,INTENT(INOUT) :: PMIN(:) +REAL(KIND=JPRB) ,INTENT(INOUT) :: PMAX(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM),INTENT(IN) :: KPROMA +LOGICAL ,INTENT(IN) :: LDAVE_ONLY +INTEGER(KIND=JPIM),OPTIONAL, INTENT(IN) :: KRESOL + +!ifndef INTERFACE + +! Local variables +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE +INTEGER(KIND=JPIM) :: IUBOUND(4) +INTEGER(KIND=JPIM) :: IVSET(KFIELDS) +INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETS(:) +INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETG(:,:) +!GPU +REAL(KIND=JPRBT) :: V +REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZAVE(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGL(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGL(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGPN(:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGPN(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZAVEG(:,:) +REAL(KIND=JPRB),ALLOCATABLE :: ZMING(:) +REAL(KIND=JPRB),ALLOCATABLE :: ZMAXG(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZSND(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZRCV(:) +INTEGER(KIND=JPIM) :: J,JGL,IGL,JL,JF,IF_GP,IF_SCALARS_G,IF_FS,JSETV,JSETW,IWLATS,JMAX +INTEGER(KIND=JPIM) :: IPROC,ITAG,ILEN,ILENR,IBEG,IEND,IND +TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR +TYPE(TRGTOL_HANDLE) :: HTRGTOL +!INTEGER(KIND=JPIM) :: iunit + +! ------------------------------------------------------------------ +IF (LHOOK) CALL DR_HOOK('GPNORM_TRANS',0,ZHOOK_HANDLE) + +! Set current resolution +CALL SET_RESOL(KRESOL) + +! Set defaults + +NPROMA = KPROMA +NGPBLKS = (D%NGPTOT-1)/NPROMA+1 + +! Consistency checks + +IUBOUND(1:3)=UBOUND(PGP) +IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'GPNORM_TRANS:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('GPNORM_TRANS:FIRST DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(2) < KFIELDS) THEN + WRITE(NOUT,*)'GPNORM_TRANS:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),KFIELDS + CALL ABORT_TRANS('GPNORM_TRANS:SECOND DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(3) < NGPBLKS) THEN + WRITE(NOUT,*)'GPNORM_TRANS:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),NGPBLKS + CALL ABORT_TRANS('GPNORM_TRANS:THIRD DIMENSION OF PGP TOO SMALL ') +ENDIF + + +IF_GP=KFIELDS +IF_SCALARS_G=KFIELDS + +IF_FS=0 +DO J=1,KFIELDS + IVSET(J)=MOD(J-1,NPRTRV)+1 + IF(IVSET(J)==MYSETV)THEN + IF_FS=IF_FS+1 + ENDIF +ENDDO +ALLOCATE(ZAVE(IF_FS,R%NDGL)) +ALLOCATE(ZMINGL(IF_FS,R%NDGL)) +ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) +ALLOCATE(ZMINGPN(IF_FS)) +ALLOCATE(ZMAXGPN(IF_FS)) + +ZAVE = 0._JPRBT +ZMINGL = 0._JPRBT +ZMAXGL = 0._JPRBT +ZMINGPN = 0._JPRBT +ZMAXGPN = 0._JPRBT +!$ACC DATA COPY(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) + +ALLOCATE(IVSETS(NPRTRV)) +IVSETS(:)=0 +DO J=1,KFIELDS + IVSETS(IVSET(J))=IVSETS(IVSET(J))+1 +ENDDO +ALLOCATE(IVSETG(NPRTRV,MAXVAL(IVSETS(:)))) +IVSETG(:,:)=0 +IVSETS(:)=0 +DO J=1,KFIELDS + IVSETS(IVSET(J))=IVSETS(IVSET(J))+1 + IVSETG(IVSET(J),IVSETS(IVSET(J)))=J +ENDDO + + +!iunit=300+myproc +!DO JF=1,IF_GP +! write(iunit,*) 'PGP field=',JF,PGP(1,JF,1),PGP(NPROMA,JF,1),PGP(1,JF,NGPBLKS) +!ENDDO + +ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() +HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,IF_GP,IF_FS) +CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) + +LGPNORM=.TRUE. +CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,IF_FS,IF_GP,0,IF_SCALARS_G,& + & KVSETSC=IVSET,PGP=PGP) +LGPNORM=.FALSE. + +IBEG=1 +IEND=D%NDGL_FS + +CALL GSTATS(1429,0) +IF( IF_FS > 0 )THEN + + !$ACC DATA & + !$ACC& PRESENT(F,F_RW) & + !$ACC& PRESENT(D,D_NSTAGTF,D_NPTRLS,G_NLOEN) + + !$ACC KERNELS + DO JF=1,IF_FS + V = PREEL_REAL(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) + ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT) + ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT) + ENDDO + !$ACC END KERNELS + +! FIRST DO SUMS IN EACH FULL LATITUDE + + !$ACC KERNELS + DO JGL=1,D%NDGL_FS + IGL = D_NPTRLS(MYSETW) + JGL - 1 + DO JF=1,IF_FS + ZAVE(JF,JGL)=0.0_JPRB + !$ACC loop + DO JL=1,G_NLOEN(IGL) + V = PREEL_REAL(IF_FS*D%NSTAGTF(JGL)+(JF-1)*(D%NSTAGTF(JGL+1)-D%NSTAGTF(JGL))+JL) + ZAVE(JF,JGL)=ZAVE(JF,JGL)+V + ZMINGL(JF,JGL)=MIN(ZMINGL(JF,JGL),V) + ZMAXGL(JF,JGL)=MAX(ZMAXGL(JF,JGL),V) + ENDDO + ENDDO + ENDDO + !$ACC END KERNELS + + !$ACC KERNELS + DO JF=1,IF_FS + ZMINGPN(JF)=MINVAL(ZMINGL(JF,IBEG:IEND)) + ZMAXGPN(JF)=MAXVAL(ZMAXGL(JF,IBEG:IEND)) + ENDDO + !$ACC END KERNELS + + !$ACC KERNELS + DO JGL=IBEG,IEND + IGL = D_NPTRLS(MYSETW) + JGL - 1 + DO JF=1,IF_FS + ZAVE(JF,JGL)=ZAVE(JF,JGL)*F_RW(IGL)/G_NLOEN(IGL) + !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F_RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF) + ENDDO + ENDDO + !$ACC END KERNELS + +!$ACC end data + +ENDIF +!$ACC end data +CALL GSTATS(1429,1) + +! from here rest on CPU + +! IT IS IMPORTANT THAT SUMS ARE NOW DONE IN LATITUDE ORDER +ALLOCATE(ZAVEG(R%NDGL,KFIELDS)) +ALLOCATE(ZMING(KFIELDS)) +ALLOCATE(ZMAXG(KFIELDS)) + +ZAVEG(:,:)=0.0_JPRD +DO JF=1,IF_FS + DO JGL=IBEG,IEND + IGL = D%NPTRLS(MYSETW) + JGL - 1 + ZAVEG(IGL,IVSETG(MYSETV,JF))=ZAVEG(IGL,IVSETG(MYSETV,JF))+ZAVE(JF,JGL) + ENDDO +ENDDO + +IF(LDAVE_ONLY)THEN + ZMING(:)=PMIN(:) + ZMAXG(:)=PMAX(:) +ELSE + DO JF=1,IF_FS + ZMING(IVSETG(MYSETV,JF))=ZMINGPN(JF) + ZMAXG(IVSETG(MYSETV,JF))=ZMAXGPN(JF) + ENDDO +ENDIF + +! RECEIVE ABOVE FROM OTHER NPRTRV SETS FOR SAME LATS BUT DIFFERENT FIELDS +ITAG=123 + +CALL GSTATS(815,0) + +IF( MYSETV==1 )THEN + + DO JSETV=2,NPRTRV + IF(LDAVE_ONLY)THEN + ILEN=D%NDGL_FS*IVSETS(JSETV)+2*KFIELDS + ELSE + ILEN=(D%NDGL_FS+2)*IVSETS(JSETV) + ENDIF + IF(ILEN > 0)THEN + ALLOCATE(ZRCV(ILEN)) + CALL SET2PE(IPROC,0,0,MYSETW,JSETV) + CALL MPL_RECV(ZRCV(:),KSOURCE=NPRCIDS(IPROC),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR,CDSTRING='GPNORM_TRANS:V') + IF(ILENR /= ILEN)THEN + CALL ABOR1('GPNORM_TRANS:ILENR /= ILEN') + ENDIF + IND=0 + DO JF=1,IVSETS(JSETV) + DO JGL=IBEG,IEND + IGL = D%NPTRLS(MYSETW) + JGL - 1 + IND=IND+1 + ZAVEG(IGL,IVSETG(JSETV,JF))=ZRCV(IND) + ENDDO + IF(.NOT.LDAVE_ONLY)THEN + IND=IND+1 + ZMING(IVSETG(JSETV,JF))=ZRCV(IND) + IND=IND+1 + ZMAXG(IVSETG(JSETV,JF))=ZRCV(IND) + ENDIF + ENDDO + IF(LDAVE_ONLY)THEN + DO JF=1,KFIELDS + IND=IND+1 + ZMING(JF)=MIN(ZMING(JF),REAL(ZRCV(IND),JPRB)) + IND=IND+1 + ZMAXG(JF)=MAX(ZMAXG(JF),REAL(ZRCV(IND),JPRB)) + ENDDO + ENDIF + DEALLOCATE(ZRCV) + ENDIF + ENDDO + +ELSE + + IF(LDAVE_ONLY)THEN + ILEN=D%NDGL_FS*IVSETS(MYSETV)+2*KFIELDS + ELSE + ILEN=(D%NDGL_FS+2)*IVSETS(MYSETV) + ENDIF + IF(ILEN > 0)THEN + CALL SET2PE(IPROC,0,0,MYSETW,1) + ALLOCATE(ZSND(ILEN)) + IND=0 + DO JF=1,IF_FS + DO JGL=IBEG,IEND + IGL = D%NPTRLS(MYSETW) + JGL - 1 + IND=IND+1 + ZSND(IND)=ZAVEG(IGL,IVSETG(MYSETV,JF)) + ENDDO + IF(.NOT.LDAVE_ONLY)THEN + IND=IND+1 + ZSND(IND)=ZMING(IVSETG(MYSETV,JF)) + IND=IND+1 + ZSND(IND)=ZMAXG(IVSETG(MYSETV,JF)) + ENDIF + ENDDO + IF(LDAVE_ONLY)THEN + DO JF=1,KFIELDS + IND=IND+1 + ZSND(IND)=PMIN(JF) + IND=IND+1 + ZSND(IND)=PMAX(JF) + ENDDO + ENDIF + CALL MPL_SEND(ZSND(:),KDEST=NPRCIDS(IPROC),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,CDSTRING='GPNORM_TRANS:V') + DEALLOCATE(ZSND) + ENDIF + +ENDIF + +! FINALLY RECEIVE CONTRIBUTIONS FROM OTHER NPRTRW SETS + +IF( MYSETV == 1 )THEN + + IF( MYSETW == 1 )THEN + + DO JSETW=2,NPRTRW + IWLATS=D%NULTPP(JSETW) + IBEG=1 + IEND=IWLATS + IF(LDAVE_ONLY)THEN + ILEN=IWLATS*KFIELDS+2*KFIELDS + ELSE + ILEN=(IWLATS+2)*KFIELDS + ENDIF + IF(ILEN > 0 )THEN + ALLOCATE(ZRCV(ILEN)) + CALL SET2PE(IPROC,0,0,JSETW,1) + CALL MPL_RECV(ZRCV(:),KSOURCE=NPRCIDS(IPROC),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR,CDSTRING='GPNORM_TRANS:W') + IF(ILENR /= ILEN)THEN + CALL ABOR1('GPNORM_TRANS:ILENR /= ILEN') + ENDIF + IND=0 + DO JF=1,KFIELDS + DO JGL=IBEG,IEND + IGL = D%NPTRLS(JSETW) + JGL - 1 + IND=IND+1 + ZAVEG(IGL,JF)=ZRCV(IND) + ENDDO + IF(.NOT.LDAVE_ONLY)THEN + IND=IND+1 + ZMING(JF)=MIN(ZMING(JF),ZRCV(IND)) + IND=IND+1 + ZMAXG(JF)=MAX(ZMAXG(JF),ZRCV(IND)) + ENDIF + ENDDO + IF(LDAVE_ONLY)THEN + DO JF=1,KFIELDS + IND=IND+1 + ZMING(JF)=MIN(ZMING(JF),ZRCV(IND)) + IND=IND+1 + ZMAXG(JF)=MAX(ZMAXG(JF),ZRCV(IND)) + ENDDO + ENDIF + DEALLOCATE(ZRCV) + ENDIF + ENDDO + + ELSE + + IF(LDAVE_ONLY)THEN + ILEN=D%NDGL_FS*KFIELDS+2*KFIELDS + ELSE + ILEN=(D%NDGL_FS+2)*KFIELDS + ENDIF + IF(ILEN > 0)THEN + CALL SET2PE(IPROC,0,0,1,1) + ALLOCATE(ZSND(ILEN)) + IND=0 + DO JF=1,KFIELDS + DO JGL=IBEG,IEND + IGL = D%NPTRLS(MYSETW) + JGL - 1 + IND=IND+1 + ZSND(IND)=ZAVEG(IGL,JF) + ENDDO + IF(.NOT.LDAVE_ONLY)THEN + IND=IND+1 + ZSND(IND)=ZMING(JF) + IND=IND+1 + ZSND(IND)=ZMAXG(JF) + ENDIF + ENDDO + IF(LDAVE_ONLY)THEN + DO JF=1,KFIELDS + IND=IND+1 + ZSND(IND)=ZMING(JF) + IND=IND+1 + ZSND(IND)=ZMAXG(JF) + ENDDO + ENDIF + CALL MPL_SEND(ZSND(:),KDEST=NPRCIDS(IPROC),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,CDSTRING='GPNORM_TRANS:V') + DEALLOCATE(ZSND) + ENDIF + + ENDIF + +ENDIF + +CALL GSTATS(815,1) + +IF( MYSETW == 1 .AND. MYSETV == 1 )THEN + + PAVE(:)=0.0_JPRB + DO JGL=1,R%NDGL + PAVE(:)=PAVE(:)+REAL(ZAVEG(JGL,:),JPRB) + ENDDO + + PMIN(:)=ZMING(:) + PMAX(:)=ZMAXG(:) + +ENDIF + +DEALLOCATE(ZAVEG) +DEALLOCATE(ZMING) +DEALLOCATE(ZMAXG) +DEALLOCATE(IVSETS) +DEALLOCATE(IVSETG) + +IF (LHOOK) CALL DR_HOOK('GPNORM_TRANS',1,ZHOOK_HANDLE) + +! ------------------------------------------------------------------ + +!endif INTERFACE + + +END SUBROUTINE GPNORM_TRANS diff --git a/src/trans/gpu/external/gpnorm_trans_gpu.F90 b/src/trans/gpu/external/gpnorm_trans_gpu.F90 new file mode 100755 index 00000000..8d941f78 --- /dev/null +++ b/src/trans/gpu/external/gpnorm_trans_gpu.F90 @@ -0,0 +1,551 @@ +! (C) Copyright 2008- ECMWF. +! (C) Copyright 2008- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) + + +!**** *GPNORM_TRANS_GPU* - calculate grid-point norms + +! Purpose. +! -------- +! calculate grid-point norms using a 2 stage (NPRTRV,NPRTRW) communication rather +! than an approach using a more expensive global gather collective communication + +!** Interface. +! ---------- +! CALL GPNORM_TRANS(...) + +! Explicit arguments : +! -------------------- +! PGP(:,:,:) - gridpoint fields (input) +! PGP is dimensioned (NPROMA,KFIELDS,NGPBLKS) where +! NPROMA is the blocking factor, KFIELDS the total number +! of fields and NGPBLKS the number of NPROMA blocks. +! KFIELDS - number of fields (input) +! (these do not have to be just levels) +! KPROMA - required blocking factor (input) +! PAVE - average (output) +! PMIN - minimum (input/output) +! PMAX - maximum (input/output) +! LDAVE_ONLY - T : PMIN and PMAX already contain local MIN and MAX +! KRESOL - resolution tag (optional) +! default assumes first defined resolution +! + +! Author. +! ------- +! George Mozdzynski *ECMWF* + +! Modifications. +! -------------- +! Original : 19th Sept 2008 +! R. El Khatib 07-08-2009 Optimisation directive for NEC + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB , JPRD +USE PARKIND_ECTRANS ,ONLY : JPRBT + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NOUT +USE TPM_DIM ,ONLY : R +USE TPM_TRANS ,ONLY : LGPNORM, NGPBLKS, NPROMA +USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF,D_NPTRLS, MYPROC +USE TPM_GEOMETRY ,ONLY : G,G_NLOEN,G_NLOEN_MAX +USE TPM_FIELDS ,ONLY : F_RW +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE TRGTOL_MOD ,ONLY : TRGTOL +USE SET2PE_MOD ,ONLY : SET2PE +USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, JP_BLOCKING_STANDARD +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,INTENT(IN) :: PGP(:,:,:) +REAL(KIND=JPRB) ,INTENT(OUT) :: PAVE(:) +REAL(KIND=JPRB) ,INTENT(INOUT) :: PMIN(:) +REAL(KIND=JPRB) ,INTENT(INOUT) :: PMAX(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM),INTENT(IN) :: KPROMA +LOGICAL ,INTENT(IN) :: LDAVE_ONLY +INTEGER(KIND=JPIM),OPTIONAL, INTENT(IN) :: KRESOL + +!ifndef INTERFACE + +! Local variables +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE +INTEGER(KIND=JPIM) :: IUBOUND(4) +INTEGER(KIND=JPIM) :: IVSET(KFIELDS) +INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETS(:) +INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETG(:,:) +!GPU +REAL(KIND=JPRBT) :: V +REAL(KIND=JPRBT),ALLOCATABLE,SAVE :: ZGTF(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZAVE(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGL(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGL(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMIN(:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMAX(:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGPN(:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGPN(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZAVEG(:,:) +REAL(KIND=JPRB),ALLOCATABLE :: ZMING(:) +REAL(KIND=JPRB),ALLOCATABLE :: ZMAXG(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZSND(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZRCV(:) +INTEGER(KIND=JPIM) :: J,JGL,IGL,JL,JF,IF_GP,IF_SCALARS_G,IF_FS,JSETV,JSETW,IWLATS,JMAX +INTEGER(KIND=JPIM) :: IPROC,ITAG,ILEN,ILENR,IBEG,IEND,IND +!INTEGER(KIND=JPIM) :: iunit + +! ------------------------------------------------------------------ +IF (LHOOK) CALL DR_HOOK('GPNORM_TRANS',0,ZHOOK_HANDLE) + +! Set current resolution +CALL SET_RESOL(KRESOL) + +! Set defaults + +NPROMA = KPROMA +NGPBLKS = (D%NGPTOT-1)/NPROMA+1 + +! Consistency checks + +IUBOUND(1:3)=UBOUND(PGP) +IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'GPNORM_TRANS:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('GPNORM_TRANS:FIRST DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(2) < KFIELDS) THEN + WRITE(NOUT,*)'GPNORM_TRANS:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),KFIELDS + CALL ABORT_TRANS('GPNORM_TRANS:SECOND DIMENSION OF PGP TOO SMALL ') +ENDIF +IF(IUBOUND(3) < NGPBLKS) THEN + WRITE(NOUT,*)'GPNORM_TRANS:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),NGPBLKS + CALL ABORT_TRANS('GPNORM_TRANS:THIRD DIMENSION OF PGP TOO SMALL ') +ENDIF + + +IF_GP=KFIELDS +IF_SCALARS_G=0 + +IF_FS=0 +DO J=1,KFIELDS + IVSET(J)=MOD(J-1,NPRTRV)+1 + IF(IVSET(J)==MYSETV)THEN + IF_FS=IF_FS+1 + ENDIF +ENDDO +IF (.NOT. ALLOCATED(ZAVE)) THEN + ALLOCATE(ZAVE(IF_FS,R%NDGL)) + ALLOCATE(ZMINGL(IF_FS,R%NDGL)) + ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) + ALLOCATE(ZMINGPN(IF_FS)) + ALLOCATE(ZMAXGPN(IF_FS)) + + ZAVE = 0._JPRBT + ZMINGL = 0._JPRBT + ZMAXGL = 0._JPRBT + ZMINGPN = 0._JPRBT + ZMAXGPN = 0._JPRBT +#ifdef ACCGPU + !$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) +#endif + IF (.NOT. ALLOCATED(ZGTF)) THEN + ALLOCATE(ZGTF(IF_FS*D%NLENGTF)) + WRITE(NOUT,*)'ZGTF :',SIZE(ZGTF) +#ifdef ACCGPU + !$ACC ENTER DATA CREATE(ZGTF) +#endif + ENDIF +ENDIF + +ALLOCATE(IVSETS(NPRTRV)) +IVSETS(:)=0 +DO J=1,KFIELDS + IVSETS(IVSET(J))=IVSETS(IVSET(J))+1 +ENDDO +ALLOCATE(IVSETG(NPRTRV,MAXVAL(IVSETS(:)))) +IVSETG(:,:)=0 +IVSETS(:)=0 +DO J=1,KFIELDS + IVSETS(IVSET(J))=IVSETS(IVSET(J))+1 + IVSETG(IVSET(J),IVSETS(IVSET(J)))=J +ENDDO + + +! done in setup_trans +LGPNORM=.TRUE. +!!FIXME +!!CALL TRGTOL_CUDAAWARE(ZGTF,IF_FS,IF_GP,IVSET,PGP=PGP) +LGPNORM=.FALSE. + +! ZGTF is now on GPU + +IBEG=1 +IEND=D%NDGL_FS + +CALL GSTATS(1429,0) +IF( IF_FS > 0 )THEN + +#ifdef ACCGPU + !$ACC DATA & + !$ACC& COPY(F_RW) & + !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) & + !$ACC& PRESENT(ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) +#endif +#ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:F_RW,D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) & + !$OMP& MAP(PRESENT,ALLOC:ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif +#ifdef ACCGPU + !$ACC KERNELS +#endif + DO JF=1,IF_FS + V = ZGTF(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) + ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT) + ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT) + ENDDO +#ifdef ACCGPU + !$ACC END KERNELS +#endif + +! FIRST DO SUMS IN EACH FULL LATITUDE + +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif +#ifdef ACCGPU + !$ACC KERNELS +#endif + DO JGL=1,D%NDGL_FS + IGL = D_NPTRLS(MYSETW) + JGL - 1 + DO JF=1,IF_FS + ZAVE(JF,JGL)=0.0_JPRBT +#ifdef ACCGPU + !$ACC LOOP +#endif + DO JL=1,G_NLOEN(IGL) + V = ZGTF(IF_FS*D%NSTAGTF(JGL)+(JF-1)*(D%NSTAGTF(JGL+1)-D%NSTAGTF(JGL))+JL) + ZAVE(JF,JGL)=ZAVE(JF,JGL)+V + ZMINGL(JF,JGL)=MIN(ZMINGL(JF,JGL),V) + ZMAXGL(JF,JGL)=MAX(ZMAXGL(JF,JGL),V) + ENDDO + ENDDO + ENDDO +#ifdef ACCGPU + !$ACC END KERNELS +#endif + +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif +#ifdef ACCGPU + !$ACC KERNELS +#endif + DO JF=1,IF_FS + ZMINGPN(JF)=MINVAL(ZMINGL(JF,IBEG:IEND)) + ZMAXGPN(JF)=MAXVAL(ZMAXGL(JF,IBEG:IEND)) + ENDDO +#ifdef ACCGPU + !$ACC END KERNELS +#endif + +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif +#ifdef ACCGPU + !$ACC KERNELS +#endif + DO JGL=IBEG,IEND + IGL = D_NPTRLS(MYSETW) + JGL - 1 + DO JF=1,IF_FS + ZAVE(JF,JGL)=ZAVE(JF,JGL)*F_RW(IGL)/G_NLOEN(IGL) + !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F_RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF) + ENDDO + ENDDO +#ifdef ACCGPU + !$ACC END KERNELS +#endif + +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif +#ifdef ACCGPU +!$ACC END DATA +#endif + +#ifdef ACCGPU +!$ACC UPDATE HOST(ZAVE) +#endif +#ifdef OMPGPU +!$OMP TARGET UPDATE FROM(ZAVE) +#endif +#ifdef ACCGPU +!$ACC UPDATE HOST(ZMINGPN) +#endif +#ifdef OMPGPU +!$OMP TARGET UPDATE FROM(ZMINGPN) +#endif +#ifdef ACCGPU +!$ACC UPDATE HOST(ZMAXGPN) +#endif +#ifdef OMPGPU +!$OMP TARGET UPDATE FROM(ZMAXGPN) +#endif +#ifdef ACCGPU +!$ACC WAIT +#endif +#ifdef OMPGPU +!$OMP BARRIER +#endif + +ENDIF +CALL GSTATS(1429,1) + +! from here rest on CPU + +! IT IS IMPORTANT THAT SUMS ARE NOW DONE IN LATITUDE ORDER +ALLOCATE(ZAVEG(R%NDGL,KFIELDS)) +ALLOCATE(ZMING(KFIELDS)) +ALLOCATE(ZMAXG(KFIELDS)) + +ZAVEG(:,:)=0.0_JPRD +DO JF=1,IF_FS + DO JGL=IBEG,IEND + IGL = D%NPTRLS(MYSETW) + JGL - 1 + ZAVEG(IGL,IVSETG(MYSETV,JF))=ZAVEG(IGL,IVSETG(MYSETV,JF))+ZAVE(JF,JGL) + ENDDO +ENDDO + +IF(LDAVE_ONLY)THEN + ZMING(:)=PMIN(:) + ZMAXG(:)=PMAX(:) +ELSE + DO JF=1,IF_FS + ZMING(IVSETG(MYSETV,JF))=ZMINGPN(JF) + ZMAXG(IVSETG(MYSETV,JF))=ZMAXGPN(JF) + ENDDO +ENDIF + +! RECEIVE ABOVE FROM OTHER NPRTRV SETS FOR SAME LATS BUT DIFFERENT FIELDS +ITAG=123 + +CALL GSTATS(815,0) + +IF( MYSETV==1 )THEN + + DO JSETV=2,NPRTRV + IF(LDAVE_ONLY)THEN + ILEN=D%NDGL_FS*IVSETS(JSETV)+2*KFIELDS + ELSE + ILEN=(D%NDGL_FS+2)*IVSETS(JSETV) + ENDIF + IF(ILEN > 0)THEN + ALLOCATE(ZRCV(ILEN)) + CALL SET2PE(IPROC,0,0,MYSETW,JSETV) + CALL MPL_RECV(ZRCV(:),KSOURCE=NPRCIDS(IPROC),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR,CDSTRING='GPNORM_TRANS:V') + IF(ILENR /= ILEN)THEN + CALL ABOR1('GPNORM_TRANS:ILENR /= ILEN') + ENDIF + IND=0 + DO JF=1,IVSETS(JSETV) + DO JGL=IBEG,IEND + IGL = D%NPTRLS(MYSETW) + JGL - 1 + IND=IND+1 + ZAVEG(IGL,IVSETG(JSETV,JF))=ZRCV(IND) + ENDDO + IF(.NOT.LDAVE_ONLY)THEN + IND=IND+1 + ZMING(IVSETG(JSETV,JF))=ZRCV(IND) + IND=IND+1 + ZMAXG(IVSETG(JSETV,JF))=ZRCV(IND) + ENDIF + ENDDO + IF(LDAVE_ONLY)THEN + DO JF=1,KFIELDS + IND=IND+1 + ZMING(JF)=MIN(ZMING(JF),REAL(ZRCV(IND),JPRB)) + IND=IND+1 + ZMAXG(JF)=MAX(ZMAXG(JF),REAL(ZRCV(IND),JPRB)) + ENDDO + ENDIF + DEALLOCATE(ZRCV) + ENDIF + ENDDO + +ELSE + + IF(LDAVE_ONLY)THEN + ILEN=D%NDGL_FS*IVSETS(MYSETV)+2*KFIELDS + ELSE + ILEN=(D%NDGL_FS+2)*IVSETS(MYSETV) + ENDIF + IF(ILEN > 0)THEN + CALL SET2PE(IPROC,0,0,MYSETW,1) + ALLOCATE(ZSND(ILEN)) + IND=0 + DO JF=1,IF_FS + DO JGL=IBEG,IEND + IGL = D%NPTRLS(MYSETW) + JGL - 1 + IND=IND+1 + ZSND(IND)=ZAVEG(IGL,IVSETG(MYSETV,JF)) + ENDDO + IF(.NOT.LDAVE_ONLY)THEN + IND=IND+1 + ZSND(IND)=ZMING(IVSETG(MYSETV,JF)) + IND=IND+1 + ZSND(IND)=ZMAXG(IVSETG(MYSETV,JF)) + ENDIF + ENDDO + IF(LDAVE_ONLY)THEN + DO JF=1,KFIELDS + IND=IND+1 + ZSND(IND)=PMIN(JF) + IND=IND+1 + ZSND(IND)=PMAX(JF) + ENDDO + ENDIF + CALL MPL_SEND(ZSND(:),KDEST=NPRCIDS(IPROC),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,CDSTRING='GPNORM_TRANS:V') + DEALLOCATE(ZSND) + ENDIF + +ENDIF + +! FINALLY RECEIVE CONTRIBUTIONS FROM OTHER NPRTRW SETS + +IF( MYSETV == 1 )THEN + + IF( MYSETW == 1 )THEN + + DO JSETW=2,NPRTRW + IWLATS=D%NULTPP(JSETW) + IBEG=1 + IEND=IWLATS + IF(LDAVE_ONLY)THEN + ILEN=IWLATS*KFIELDS+2*KFIELDS + ELSE + ILEN=(IWLATS+2)*KFIELDS + ENDIF + IF(ILEN > 0 )THEN + ALLOCATE(ZRCV(ILEN)) + CALL SET2PE(IPROC,0,0,JSETW,1) + CALL MPL_RECV(ZRCV(:),KSOURCE=NPRCIDS(IPROC),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR,CDSTRING='GPNORM_TRANS:W') + IF(ILENR /= ILEN)THEN + CALL ABOR1('GPNORM_TRANS:ILENR /= ILEN') + ENDIF + IND=0 + DO JF=1,KFIELDS + DO JGL=IBEG,IEND + IGL = D%NPTRLS(JSETW) + JGL - 1 + IND=IND+1 + ZAVEG(IGL,JF)=ZRCV(IND) + ENDDO + IF(.NOT.LDAVE_ONLY)THEN + IND=IND+1 + ZMING(JF)=MIN(ZMING(JF),ZRCV(IND)) + IND=IND+1 + ZMAXG(JF)=MAX(ZMAXG(JF),ZRCV(IND)) + ENDIF + ENDDO + IF(LDAVE_ONLY)THEN + DO JF=1,KFIELDS + IND=IND+1 + ZMING(JF)=MIN(ZMING(JF),ZRCV(IND)) + IND=IND+1 + ZMAXG(JF)=MAX(ZMAXG(JF),ZRCV(IND)) + ENDDO + ENDIF + DEALLOCATE(ZRCV) + ENDIF + ENDDO + + ELSE + + IF(LDAVE_ONLY)THEN + ILEN=D%NDGL_FS*KFIELDS+2*KFIELDS + ELSE + ILEN=(D%NDGL_FS+2)*KFIELDS + ENDIF + IF(ILEN > 0)THEN + CALL SET2PE(IPROC,0,0,1,1) + ALLOCATE(ZSND(ILEN)) + IND=0 + DO JF=1,KFIELDS + DO JGL=IBEG,IEND + IGL = D%NPTRLS(MYSETW) + JGL - 1 + IND=IND+1 + ZSND(IND)=ZAVEG(IGL,JF) + ENDDO + IF(.NOT.LDAVE_ONLY)THEN + IND=IND+1 + ZSND(IND)=ZMING(JF) + IND=IND+1 + ZSND(IND)=ZMAXG(JF) + ENDIF + ENDDO + IF(LDAVE_ONLY)THEN + DO JF=1,KFIELDS + IND=IND+1 + ZSND(IND)=ZMING(JF) + IND=IND+1 + ZSND(IND)=ZMAXG(JF) + ENDDO + ENDIF + CALL MPL_SEND(ZSND(:),KDEST=NPRCIDS(IPROC),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,CDSTRING='GPNORM_TRANS:V') + DEALLOCATE(ZSND) + ENDIF + + ENDIF + +ENDIF + +CALL GSTATS(815,1) + +IF( MYSETW == 1 .AND. MYSETV == 1 )THEN + + PAVE(:)=0.0_JPRB + DO JGL=1,R%NDGL + PAVE(:)=PAVE(:)+REAL(ZAVEG(JGL,:),JPRB) + ENDDO + + PMIN(:)=ZMING(:) + PMAX(:)=ZMAXG(:) + +ENDIF + +!DEALLOCATE(ZGTF) +!DEALLOCATE(ZAVE) +!DEALLOCATE(ZMIN) +!DEALLOCATE(ZMAX) +DEALLOCATE(ZAVEG) +DEALLOCATE(ZMING) +DEALLOCATE(ZMAXG) +DEALLOCATE(IVSETS) +DEALLOCATE(IVSETG) + +IF (LHOOK) CALL DR_HOOK('GPNORM_TRANS',1,ZHOOK_HANDLE) + +! ------------------------------------------------------------------ + +!endif INTERFACE + + +END SUBROUTINE GPNORM_TRANS_GPU diff --git a/src/trans/gpu/external/ini_spec_dist.F90 b/src/trans/gpu/external/ini_spec_dist.F90 new file mode 100755 index 00000000..7dfe61d5 --- /dev/null +++ b/src/trans/gpu/external/ini_spec_dist.F90 @@ -0,0 +1,96 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE INI_SPEC_DIST(KSMAX,KTMAX,KPRTRW,KMYSETW,KASM0,KSPOLEGL,KPROCM,& + &KUMPP,KSPEC,KSPEC2,KSPEC2MX,KPOSSP,KMYMS,KPTRMS,KALLMS) + + +!**** *INI_SPEC_DIST* - Initialize spectral wave distribution + +! Purpose. +! -------- +! Initialize arrays controlling spectral wave distribution + +!** Interface. +! ---------- +! CALL INI_SPEC_DIST(...) + +! Explicit arguments : +! -------------------- +! KSMAX - spectral truncation required +! KTMAX - Overtruncation for KSMAX (input) +! KPRTRW - Number of processors in A-direction (input) +! KMYSETW - A-set for present processor (input) +! KASM0 - Offsets for spectral waves (output) +! KSPOLEGL - Local version of NSPOLEG (output) +! KPROCM - Where a certain spectral wave belongs (output) +! KUMPP - Number of spectral waves on this PE (output) +! KSPEC - Local version on NSPEC (output) +! KSPEC2 - Local version on NSPEC2 (output) +! KSPEC2MX - Maximum KSPEC2 across PEs (output) +! KPOSSP - Global spectral fields partitioning (output) +! KMYMS - This PEs spectral zonal wavenumbers (output) +! KPTRMS - Pointer to the first wave number of a given a-set (output) +! KALLMS - Wave numbers for all wave-set concatenated together +! to give all wave numbers in wave-set order (output) + +! Implicit arguments : NONE +! -------------------- + +! Method. +! ------- +! See documentation + +! Externals. SUWAVEDI +! ---------- +! Original : 00-03-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB +!ifndef INTERFACE +USE SUWAVEDI_MOD ,ONLY : SUWAVEDI +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +!endif INTERFACE + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KSMAX +INTEGER(KIND=JPIM),INTENT(IN) :: KTMAX +INTEGER(KIND=JPIM),INTENT(IN) :: KPRTRW +INTEGER(KIND=JPIM),INTENT(IN) :: KMYSETW +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KSPEC +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KSPEC2 +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KSPEC2MX +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KSPOLEGL + +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KASM0(0:KSMAX) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KPROCM(0:KSMAX) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KUMPP(KPRTRW) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KPOSSP(KPRTRW+1) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KMYMS(KSMAX+1) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KPTRMS(KPRTRW) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KALLMS(KSMAX+1) + +!ifndef INTERFACE +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('INI_SPEC_DIST',0,ZHOOK_HANDLE) + +CALL SUWAVEDI(KSMAX,KTMAX,KPRTRW,KMYSETW,KASM0,KSPOLEGL,KPROCM,& + &KUMPP,KSPEC,KSPEC2,KSPEC2MX,KPOSSP,KMYMS,KPTRMS,KALLMS) + +IF (LHOOK) CALL DR_HOOK('INI_SPEC_DIST',1,ZHOOK_HANDLE) + +!endif INTERFACE + +END SUBROUTINE INI_SPEC_DIST diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 new file mode 100755 index 00000000..a0ac0c31 --- /dev/null +++ b/src/trans/gpu/external/inv_trans.F90 @@ -0,0 +1,645 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& + & FSPGL_PROC,& + & LDSCDERS,LDVORGP,LDDIVGP,LDUVDER,LDLATLON,KPROMA,KVSETUV,KVSETSC,KRESOL,& + & KVSETSC3A,KVSETSC3B,KVSETSC2,& + & PGP,PGPUV,PGP3A,PGP3B,PGP2) + +!**** *INV_TRANS* - Inverse spectral transform. + +! Purpose. +! -------- +! Interface routine for the inverse spectral transform + +!** Interface. +! ---------- +! CALL INV_TRANS(...) + +! Explicit arguments : All arguments are optional. +! -------------------- +! PSPVOR(:,:) - spectral vorticity (input) +! PSPDIV(:,:) - spectral divergence (input) +! PSPSCALAR(:,:) - spectral scalarvalued fields (input) +! PSPSC3A(:,:,:) - alternative to use of PSPSCALAR, see PGP3A below (input) +! PSPSC3B(:,:,:) - alternative to use of PSPSCALAR, see PGP3B below (input) +! PSPSC2(:,:) - alternative to use of PSPSCALAR, see PGP2 below (input) +! FSPGL_PROC - external procedure to be executed in fourier space +! before transposition +! LDSCDERS - indicating if derivatives of scalar variables are req. +! LDVORGP - indicating if grid-point vorticity is req. +! LDDIVGP - indicating if grid-point divergence is req. +! LDUVDER - indicating if E-W derivatives of u and v are req. +! LDLATLON - indicating if regular lat-lon output requested +! KPROMA - required blocking factor for gridpoint output +! KVSETUV(:) - indicating which 'b-set' in spectral space owns a +! vor/div field. Equivalant to NBSETLEV in the IFS. +! The length of KVSETUV should be the GLOBAL number +! of u/v fields which is the dimension of u and v releated +! fields in grid-point space. +! KVESETSC(:) - indicating which 'b-set' in spectral space owns a +! scalar field. As for KVSETUV this argument is required +! if the total number of processors is greater than +! the number of processors used for distribution in +! spectral wave space. +! KVSETSC3A(:) - as KVESETSC for PSPSC3A (distribution on first dimension) +! KVSETSC3B(:) - as KVESETSC for PSPSC3B (distribution on first dimension) +! KVSETSC2(:) - as KVESETSC for PSPSC2 (distribution on first dimension) +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PGP(:,:,:) - gridpoint fields (output) +! PGP need to dimensioned (NPROMA,IF_GP,NGPBLKS) where +! NPROMA is the blocking factor, IF_GP the total number +! of output fields and NGPBLKS the number of NPROMA blocks. +! The ordering of the output fields is as follows (all +! parts are optional depending on the input switches): +! +! vorticity : IF_UV_G fields (if psvor present and LDVORGP) +! divergence : IF_UV_G fields (if psvor present and LDDIVGP) +! u : IF_UV_G fields (if psvor present) +! v : IF_UV_G fields (if psvor present) +! scalar fields : IF_SCALARS_G fields (if pspscalar present) +! N-S derivative of scalar fields : IF_SCALARS_G fields (if pspscalar +! present and LDSCDERS) +! E-W derivative of u : IF_UV_G fields (if psvor present and and LDUVDER) +! E-W derivative of v : IF_UV_G fields (if psvor present and and LDUVDER) +! E-W derivative of scalar fields : IF_SCALARS_G fields (if pspscalar +! present and LDSCDERS) +! +! Here IF_UV_G is the GLOBAL number of u/v fields as given by the length +! of KVSETUV (or by PSPVOR if no split in spectral 'b-set' direction +! IF_SCALARS_G is the GLOBAL number of scalar fields as giben by the +! length of KVESETSC (or by number of fields in PSPSCALAR if no spectral +! 'b-set' split + +! As an alternative to using PGP you can also use a combination of the +! following arrays. The reason for introducing these alternative ways +! of calling INV_TRANS is to avoid uneccessary copies where your data +! structures don't fit in to the 'PSPVOR,PSPDIV, PSPSCALAR, PGP' layout. +! The use of any of these precludes the use of PGP and vice versa. + +! PGPUV(:,:,:,:) - the 'u-v' related grid-point variables in the order +! described for PGP. The second dimension of PGPUV should +! be the same as the "global" first dimension of +! PSPVOR,PSPDIV (in the IFS this is the number of levels) +! PGPUV need to be dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (u,v,vor,div ...) +! PGP3A(:,:,:,:) - grid-point array directly connected with PSPSC3A +! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC3A if no derivatives, 3 times that with der.) +! PGP3B(:,:,:,:) - grid-point array directly connected with PSPSC3B +! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC3B if no derivatives, 3 times that with der.) +! PGP2(:,:,:) - grid-point array directly connected with PSPSC2 +! dimensioned(NPROMA,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC2 if no derivatives, 3 times that with der.) +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- LTINV_CTL - control of Legendre transform +! FTINV_CTL - control of Fourier transform + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 +! 26-02-03 Mats Hamrud & Gabor Radnoti : modified condition for scalar fields +! and derivatives (IF_SCALARS_G) + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR, NOUT, NPROMATR, LSYNC_TRANS +!USE TPM_DIM +USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, LATLON, & + & NF_SC2, NF_SC3A, NF_SC3B, NGPBLKS, NPROMA + +USE TPM_FLT ,ONLY : S +USE TPM_GEOMETRY ,ONLY : G +!USE TPM_GEOMETRY +!USE TPM_FIELDS +!USE TPM_FFT +USE TPM_DISTR ,ONLY : D, NPRTRV, MYSETV, MYPROC +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE INV_TRANS_CTL_MOD ,ONLY : INV_TRANS_CTL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE MPL_MODULE ,ONLY : MPL_BARRIER +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE TPM_STATS ,ONLY : GSTATS => GSTATS_NVTX + +#ifdef _OPENACC +USE OPENACC +!USE ACCEL_LIB !only for NVIDIA GPUs +#endif + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPVOR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPDIV(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSCALAR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3A(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3B(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC2(:,:) +LOGICAL ,OPTIONAL, INTENT(IN) :: LDSCDERS +LOGICAL ,OPTIONAL, INTENT(IN) :: LDVORGP +LOGICAL ,OPTIONAL, INTENT(IN) :: LDDIVGP +LOGICAL ,OPTIONAL, INTENT(IN) :: LDUVDER +LOGICAL ,OPTIONAL, INTENT(IN) :: LDLATLON +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +EXTERNAL FSPGL_PROC +OPTIONAL FSPGL_PROC +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP(:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGPUV(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) + +!ifndef INTERFACE + +! Local varaibles +INTEGER(KIND=JPIM) :: IUBOUND(4),J +INTEGER(KIND=JPIM) :: IF_UV,IF_UV_G,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP,IF_OUT_LT +INTEGER(KIND=JPIM) :: IF_SCDERS,IF_UV_PAR +INTEGER(KIND=JPIM) :: IF_SC2_G,IF_SC3A_G2,IF_SC3A_G3,IF_SC3B_G2,IF_SC3B_G3 +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE +INTEGER(KIND=JPIM) :: JMLOC +INTEGER(KIND=JPIM) :: UNIT_NO,IDEVTYPE,NUMDEVS,MYGPU,MYNUM +! ------------------------------------------------------------------ + +UNIT_NO=300+MYPROC +CALL FLUSH(UNIT_NO) + +IF (LHOOK) CALL DR_HOOK('INV_TRANS',0,ZHOOK_HANDLE) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF + +CALL GSTATS(420,0) +CALL GSTATS(1807,0) +! Set current resolution +CALL SET_RESOL(KRESOL) + +! Set defaults + +LVORGP = .FALSE. +LDIVGP = .FALSE. +LUVDER = .FALSE. +LATLON =.FALSE. +IF_UV = 0 +IF_UV_G = 0 +IF_UV_PAR = 0 +IF_SCALARS = 0 +IF_SCALARS_G = 0 +IF_SCDERS = 0 +NF_SC2 = 0 +NF_SC3A = 0 +NF_SC3B = 0 +IF_SC2_G = 0 +IF_SC3A_G2 = 0 +IF_SC3B_G2 = 0 +IF_SC3A_G3 = 0 +IF_SC3B_G3 = 0 +NPROMA = D%NGPTOT +LSCDERS = .FALSE. + +! Decide requirements + +IF(PRESENT(KVSETUV)) THEN + IF_UV_G = UBOUND(KVSETUV,1) + IF_UV_PAR = 2 + DO J=1,IF_UV_G + IF(KVSETUV(J) > NPRTRV .OR. KVSETUV(J) < 1) THEN + WRITE(NERR,*) 'INV_TRANS:KVSETUV(J) > NPRTRV ',J,KVSETUV(J),NPRTRV + CALL ABORT_TRANS('INV_TRANS:KVSETUV TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETUV(J) == MYSETV) THEN + IF_UV = IF_UV+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPVOR)) THEN + IF_UV = UBOUND(PSPVOR,1) + IF_UV_G = IF_UV + IF_UV_PAR = 2 +ENDIF + +IF(PRESENT(KVSETSC)) THEN + IF(.NOT. PRESENT(PSPSCALAR) ) THEN + CALL ABORT_TRANS('INV_TRANS : KVSETSC PRESENT BUT PSPSCALAR MISSING') + ENDIF + IF_SCALARS_G = UBOUND(KVSETSC,1) + DO J=1,IF_SCALARS_G + IF(KVSETSC(J) > NPRTRV .OR. KVSETSC(J) < 1) THEN + WRITE(NERR,*) 'INV_TRANS:KVSETSC(J) > NPRTRV ',J,KVSETSC(J),NPRTRV + CALL ABORT_TRANS('INV_TRANS:KVSETSC TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETSC(J) == MYSETV) THEN + IF_SCALARS = IF_SCALARS+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPSCALAR)) THEN + IF_SCALARS = UBOUND(PSPSCALAR,1) + IF_SCALARS_G = IF_SCALARS +ENDIF + +IF(PRESENT(KVSETSC2)) THEN + IF(.NOT.PRESENT(PSPSC2)) THEN + CALL ABORT_TRANS('INV_TRANS:KVSETSC2 BUT NOT PSPSC2') + ENDIF + IF_SC2_G = UBOUND(KVSETSC2,1) + IF_SCALARS_G = IF_SCALARS_G+UBOUND(KVSETSC2,1) + DO J=1,UBOUND(KVSETSC2,1) + IF(KVSETSC2(J) > NPRTRV .OR. KVSETSC2(J) < 1) THEN + WRITE(NERR,*) 'INV_TRANS:KVSETSC2(J) > NPRTRV ',J,KVSETSC2(J),NPRTRV + CALL ABORT_TRANS('INV_TRANS:KVSETSC2 TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETSC2(J) == MYSETV) THEN + IF_SCALARS = IF_SCALARS+1 + NF_SC2 = NF_SC2+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPSC2)) THEN + IF_SC2_G = UBOUND(PSPSC2,1) + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC2,1) + IF_SCALARS_G = IF_SCALARS_G + IF_SC2_G + NF_SC2 = UBOUND(PSPSC2,1) +ENDIF + +IF(PRESENT(KVSETSC3A)) THEN + IF(.NOT.PRESENT(PSPSC3A)) THEN + CALL ABORT_TRANS('INV_TRANS:KVSETSC3A BUT NOT PSPSC3A') + ENDIF + IF_SC3A_G2 = UBOUND(KVSETSC3A,1) + IF_SC3A_G3 = UBOUND(PSPSC3A,3) + IF_SCALARS_G = IF_SCALARS_G+IF_SC3A_G2*IF_SC3A_G3 + DO J=1,UBOUND(KVSETSC3A,1) + IF(KVSETSC3A(J) > NPRTRV .OR. KVSETSC3A(J) < 1) THEN + WRITE(NERR,*) 'INV_TRANS:KVSETSC3A(J) > NPRTRV ',J,KVSETSC3A(J),NPRTRV + CALL ABORT_TRANS& + &('INV_TRANS:KVSETSC3A TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETSC3A(J) == MYSETV) THEN + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3A,3) + NF_SC3A = NF_SC3A+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPSC3A)) THEN + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3A,1)*UBOUND(PSPSC3A,3) + IF_SC3A_G2 = UBOUND(PSPSC3A,1) + IF_SC3A_G3 = UBOUND(PSPSC3A,3) + IF_SCALARS_G = IF_SCALARS_G + IF_SC3A_G2*IF_SC3A_G3 + NF_SC3A = UBOUND(PSPSC3A,1) +ENDIF + +IF(PRESENT(KVSETSC3B)) THEN + IF(.NOT.PRESENT(PSPSC3B)) THEN + CALL ABORT_TRANS('INV_TRANS:KVSETSC3B BUT NOT PSPSC3B') + ENDIF + IF_SC3B_G2 = UBOUND(KVSETSC3B,1) + IF_SC3B_G3 = UBOUND(PSPSC3B,3) + IF_SCALARS_G = IF_SCALARS_G+IF_SC3B_G2*IF_SC3B_G3 + DO J=1,UBOUND(KVSETSC3B,1) + IF(KVSETSC3B(J) > NPRTRV .OR. KVSETSC3B(J) < 1) THEN + WRITE(NERR,*) 'INV_TRANS:KVSETSC3B(J) > NPRTRV ',J,KVSETSC3B(J),NPRTRV + CALL ABORT_TRANS('INV_TRANS:KVSETSC3B TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETSC3B(J) == MYSETV) THEN + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3B,3) + NF_SC3B = NF_SC3B+1 + ENDIF + ENDDO +ELSEIF(PRESENT(PSPSC3B)) THEN + IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3B,1)*UBOUND(PSPSC3B,3) + IF_SC3B_G2 = UBOUND(PSPSC3B,1) + IF_SC3B_G3 = UBOUND(PSPSC3B,3) + IF_SCALARS_G = IF_SCALARS_G + IF_SC3B_G2*IF_SC3B_G3 + NF_SC3B = UBOUND(PSPSC3B,1) +ENDIF + +IF (IF_SCALARS_G > 0 ) THEN + IF(PRESENT(LDSCDERS)) THEN + LSCDERS = LDSCDERS + IF (LSCDERS) IF_SCDERS = IF_SCALARS + ENDIF +ENDIF + +IF(PRESENT(KPROMA)) THEN + NPROMA = KPROMA +ENDIF + +IF(PRESENT(LDVORGP)) THEN + LVORGP = LDVORGP +ENDIF + +IF(PRESENT(LDDIVGP)) THEN + LDIVGP = LDDIVGP +ENDIF + +IF(PRESENT(LDUVDER)) THEN + LUVDER = LDUVDER +ENDIF + +IF(PRESENT(LDLATLON)) THEN + LATLON = LDLATLON +ENDIF + +! Compute derived variables + +NGPBLKS = (D%NGPTOT-1)/NPROMA+1 + +IF_OUT_LT = 2*IF_UV + IF_SCALARS+IF_SCDERS + +IF(IF_UV > 0 .AND. LVORGP) THEN + IF_OUT_LT = IF_OUT_LT+IF_UV +ENDIF +IF(IF_UV > 0 .AND. LDIVGP) THEN + IF_OUT_LT = IF_OUT_LT+IF_UV +ENDIF +IF_FS = IF_OUT_LT+IF_SCDERS +IF(IF_UV > 0 .AND. LUVDER) THEN + IF_FS = IF_FS+2*IF_UV +ENDIF + +IF_GP = 2*IF_UV_G+IF_SCALARS_G +IF(LSCDERS) THEN + IF_GP = IF_GP+2*IF_SCALARS_G + IF_SC2_G = IF_SC2_G*3 + IF_SC3A_G3 = IF_SC3A_G3*3 + IF_SC3B_G3 = IF_SC3B_G3*3 +ENDIF +IF(IF_UV_G > 0 .AND. LVORGP) THEN + IF_GP = IF_GP+IF_UV_G + IF_UV_PAR = IF_UV_PAR+1 +ENDIF +IF(IF_UV_G > 0 .AND. LDIVGP) THEN + IF_GP = IF_GP+IF_UV_G + IF_UV_PAR = IF_UV_PAR+1 +ENDIF +IF(IF_UV_G > 0 .AND. LUVDER) THEN + IF_GP = IF_GP+2*IF_UV_G + IF_UV_PAR = IF_UV_PAR+2 +ENDIF + +! Consistency checks + +IF (IF_UV > 0) THEN + IF(.NOT. PRESENT(PSPVOR) ) THEN + CALL ABORT_TRANS('INV_TRANS : IF_UV > 0 BUT PSPVOR MISSING') + ENDIF + IF(UBOUND(PSPVOR,1) /= IF_UV) THEN + WRITE(NERR,*)'INV_TRANS : UBOUND(PSPVOR,1) /= IF_UV ',UBOUND(PSPVOR,1),IF_UV + CALL ABORT_TRANS('INV_TRANS : PSPVOR TOO SHORT OR TOO LONG') + ENDIF + IF(.NOT. PRESENT(PSPDIV) ) THEN + CALL ABORT_TRANS('INV_TRANS : IF_UV > 0 BUT PSPDIV MISSING') + ENDIF + IF(UBOUND(PSPDIV,1) /= IF_UV) THEN + WRITE(NERR,*)'INV_TRANS : UBOUND(PSPDIV,1) /= IF_UV ',UBOUND(PSPDIV,1),IF_UV + CALL ABORT_TRANS('INV_TRANS : PSPDIV TOO SHORT OR TOO LONG') + ENDIF +ENDIF + +IF (IF_SCALARS > 0) THEN + IF(PRESENT(PSPSCALAR)) THEN + IF(PRESENT(PSPSC3A))THEN + CALL ABORT_TRANS('INV_TRANS : PSPSCALAR AND PSPSC3A BOTH PRESENT') + ENDIF + IF(PRESENT(PSPSC3B))THEN + CALL ABORT_TRANS('INV_TRANS : PSPSCALAR AND PSPSC3B BOTH PRESENT') + ENDIF + IF(PRESENT(PSPSC2))THEN + CALL ABORT_TRANS('INV_TRANS : PSPSCALAR AND PSPSC2 BOTH PRESENT') + ENDIF + IF(UBOUND(PSPSCALAR,1) /= IF_SCALARS) THEN + WRITE(NERR,*)'INV_TRANS : UBOUND(PSPSCALAR,1) /= IF_SCALARS) ',& + & UBOUND(PSPSCALAR,1),IF_SCALARS + CALL ABORT_TRANS('INV_TRANS : PSPSCALAR TOO SHORT OR TOO LONG') + ENDIF + ELSEIF(PRESENT(PSPSC3A)) THEN + ENDIF +ENDIF + +IF(IF_UV_G == 0) THEN + LUVDER = .FALSE. +ENDIF + +IF(NPRTRV >1) THEN + IF(IF_UV > 0 .AND. .NOT. PRESENT(KVSETUV)) THEN + WRITE(NERR,*)'NPRTRV >1 AND IF_UV > 0 AND NOT PRESENT(KVSETUV)',& + &NPRTRV,IF_UV + CALL ABORT_TRANS('INV_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF + IF(PRESENT(PSPSCALAR) .AND. .NOT. PRESENT(KVSETSC)) THEN + WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSCALAR) AND NOT PRESENT(KVSETSC)',& + &NPRTRV + CALL ABORT_TRANS('INV_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF + IF(PRESENT(PSPSC2) .AND. .NOT. PRESENT(KVSETSC2)) THEN + WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC2) AND NOT PRESENT(KVSETSC2)',& + &NPRTRV + CALL ABORT_TRANS('INV_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF + IF(PRESENT(PSPSC3A) .AND. .NOT. PRESENT(KVSETSC3A)) THEN + WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC3A) AND NOT PRESENT(KVSETSC3A)',& + &NPRTRV + CALL ABORT_TRANS('INV_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF + IF(PRESENT(PSPSC3B) .AND. .NOT. PRESENT(KVSETSC3B)) THEN + WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC3B) AND NOT PRESENT(KVSETSC3B)',& + &NPRTRV + CALL ABORT_TRANS('INV_TRANS: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF +ENDIF + +IF(PRESENT(PGP)) THEN + IF(PRESENT(PGPUV)) THEN + CALL ABORT_TRANS('INV_TRANS:PGP AND PGPUV CAN NOT BOTH BE PRESENT') + ENDIF + IF(PRESENT(PGP3A)) THEN + CALL ABORT_TRANS('INV_TRANS:PGP AND PGP3A CAN NOT BOTH BE PRESENT') + ENDIF + IF(PRESENT(PGP3B)) THEN + CALL ABORT_TRANS('INV_TRANS:PGP AND PGP3B CAN NOT BOTH BE PRESENT') + ENDIF + IF(PRESENT(PGP2)) THEN + CALL ABORT_TRANS('INV_TRANS:PGP AND PGP2 CAN NOT BOTH BE PRESENT') + ENDIF + IUBOUND(1:3)=UBOUND(PGP) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP TOO SMALL/LARGE ') + ENDIF + IF(IUBOUND(2) < IF_GP) THEN + WRITE(NOUT,*)'INV_TRANS:SEC. DIM. OF PGP TOO SMALL/LARGE ',IUBOUND(2),IF_GP + WRITE(NOUT,*)'IF_UV_G,IF_SCALARS_G,LSCDERS,LVORGP,LDIVGP,LUVDER ',& + & IF_UV_G,IF_SCALARS_G,LSCDERS,LVORGP,LDIVGP,LUVDER + CALL ABORT_TRANS('INV_TRANS:SECOND DIMENSION OF PGP TOO SMALL ') + ENDIF + IF(IUBOUND(3) < NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGP TOO SMALL/LARGE ',IUBOUND(3),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP TOO SMALL/LARGE ') + ENDIF +ELSE + IF(NPROMATR > 0 .AND. 2*IF_UV_G+IF_SCALARS_G > NPROMATR) THEN + CALL ABORT_TRANS('INV_TRANS:ALTERNATIVES TO USING PGP NOT SUPPORTED WITH NPROMATR>0') + ENDIF +ENDIF + +IF(PRESENT(PGPUV)) THEN + IF(.NOT.PRESENT(PSPVOR)) THEN + CALL ABORT_TRANS('INV_TRANS:PSPVOR HAS TO BE PRESENT WHEN PGPUV IS') + ENDIF + IUBOUND(1:4)=UBOUND(PGPUV) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGPUV TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGPUV TOO SMALL/LARGE ') + ENDIF + IF(IUBOUND(2) < IF_UV_G) THEN + WRITE(NOUT,*)'INV_TRANS:SEC. DIM. OF PGPUV INCONSISTENT ',IUBOUND(2),IF_UV_G + CALL ABORT_TRANS('INV_TRANS:SEC. DIMENSION OF PGPUV INCONSISTENT ') + ENDIF + IF(IUBOUND(3) < IF_UV_PAR) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGPUV TOO SMALL/LARGE ',IUBOUND(3),IF_UV_PAR + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGPUV TOO SMALL/LARGE ') + ENDIF + IF(IUBOUND(4) < NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:FOURTH DIM. OF PGPUV TOO SMALL/LARGE ',IUBOUND(4),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:FOURTH DIMENSION OF PGPUV TOO SMALL/LARGE ') + ENDIF +ENDIF +IF(PRESENT(PGP2)) THEN + IF(.NOT.PRESENT(PSPSC2)) THEN + CALL ABORT_TRANS('INV_TRANS:PSPSC2 HAS TO BE PRESENT WHEN PGP2 IS') + ENDIF +ENDIF +IF(IF_SC2_G > 0) THEN + IF(PRESENT(PGP2)) THEN + IUBOUND(1:3)=UBOUND(PGP2) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP2 TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP2 TOO SMALL/LARGE ') + ENDIF + IF(IUBOUND(2) /= IF_SC2_G) THEN + WRITE(NOUT,*)'INV_TRANS:SEC. DIM. OF PGP2 INCONSISTENT ',IUBOUND(2),IF_SC2_G + CALL ABORT_TRANS('INV_TRANS:SEC. DIMENSION OF PGP2 INCONSISTENT') + ENDIF + IF(IUBOUND(3) < NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGP2 TOO SMALL/LARGE ',IUBOUND(3),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP2 TOO SMALL/LARGE ') + ENDIF + ELSE + CALL ABORT_TRANS('INV_TRANS:PGP2 MISSING') + ENDIF +ENDIF + +IF(PRESENT(PGP3A)) THEN + IF(.NOT.PRESENT(PSPSC3A)) THEN + CALL ABORT_TRANS('INV_TRANS:PSPSC3A HAS TO BE PRESENT WHEN PGP3A IS') + ENDIF +ENDIF +IF(IF_SC3A_G3 > 0) THEN + IF(PRESENT(PGP3A)) THEN + IUBOUND=UBOUND(PGP3A) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP3A TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP3A TOO SMALL/LARGE ') + ENDIF + IF(IUBOUND(2) /= IF_SC3A_G2) THEN + WRITE(NOUT,*)'INV_TRANS:SEC DIM. OF PGP3A INCONSISTENT ',IUBOUND(2),IF_SC3A_G2 + CALL ABORT_TRANS('INV_TRANS:SEC. DIMENSION OF PGP3A INCONSISTENT ') + ENDIF + IF(IUBOUND(3) /= IF_SC3A_G3 ) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGP3A INCONSISTENT ',& + & IUBOUND(3),IF_SC3A_G3 + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP3A INCONSISTENT ') + ENDIF + IF(IUBOUND(4) < NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:FOURTH DIM. OF PGP3A TOO SMALL/LARGE ',IUBOUND(4),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:FOURTH DIMENSION OF PGP3A TOO SMALL/LARGE ') + ENDIF + ELSE + CALL ABORT_TRANS('INV_TRANS:PGP3A MISSING') + ENDIF +ENDIF + +IF(PRESENT(PGP3B)) THEN + IF(.NOT.PRESENT(PSPSC3B)) THEN + CALL ABORT_TRANS('INV_TRANS:PSPSC3B HAS TO BE PRESENT WHEN PGP3B IS') + ENDIF +ENDIF +IF(IF_SC3B_G3 > 0) THEN + IF(PRESENT(PGP3B)) THEN + IUBOUND=UBOUND(PGP3B) + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP3B TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP3B TOO SMALL/LARGE ') + ENDIF + IF(IUBOUND(2) /= IF_SC3B_G2) THEN + WRITE(NOUT,*)'INV_TRANS:SEC DIM. OF PGP3B INCONSISTENT ',IUBOUND(2),IF_SC3B_G2 + CALL ABORT_TRANS('INV_TRANS:SEC. DIMENSION OF PGP3B INCONSISTENT ') + ENDIF + IF(IUBOUND(3) /= IF_SC3B_G3 ) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGP3B INCONSISTENT ',& + & IUBOUND(3),IF_SC3B_G3 + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP3B INCONSISTENT ') + ENDIF + IF(IUBOUND(1) < NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP3B TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP3B TOO SMALL/LARGE ') + ENDIF + ELSE + CALL ABORT_TRANS('INV_TRANS:PGP3B MISSING') + ENDIF +ENDIF +CALL GSTATS(1807,1) + +! ------------------------------------------------------------------ + +! Perform transform + +CALL INV_TRANS_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,IF_OUT_LT,& + & IF_UV,IF_SCALARS,IF_SCDERS,& + & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,FSPGL_PROC,& + & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) + +IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) +ENDIF +CALL GSTATS(420,1) + +IF (LHOOK) CALL DR_HOOK('INV_TRANS',1,ZHOOK_HANDLE) +! ------------------------------------------------------------------ + +!endif INTERFACE + +END SUBROUTINE INV_TRANS diff --git a/src/trans/gpu/external/inv_transad.F90 b/src/trans/gpu/external/inv_transad.F90 new file mode 100755 index 00000000..0635b20f --- /dev/null +++ b/src/trans/gpu/external/inv_transad.F90 @@ -0,0 +1,166 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE INV_TRANSAD(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& + & FSPGL_PROC,& + & LDSCDERS,LDVORGP,LDDIVGP,LDUVDER,KPROMA,KVSETUV,KVSETSC,KRESOL,& + & KVSETSC3A,KVSETSC3B,KVSETSC2,& + & PGP,PGPUV,PGP3A,PGP3B,PGP2) + +!**** *INV_TRANSAD* - Inverse spectral transform - adjoint. + +! Purpose. +! -------- +! Interface routine for the inverse spectral transform - adjoint + +!** Interface. +! ---------- +! CALL INV_TRANSAD(...) + +! Explicit arguments : All arguments except from PGP are optional. +! -------------------- +! PSPVOR(:,:) - spectral vorticity (input) +! PSPDIV(:,:) - spectral divergence (input) +! PSPSCALAR(:,:) - spectral scalarvalued fields (input) +! PSPSC3A(:,:,:) - alternative to use of PSPSCALAR, see PGP3A below (input) +! PSPSC3B(:,:,:) - alternative to use of PSPSCALAR, see PGP3B below (input) +! PSPSC2(:,:) - alternative to use of PSPSCALAR, see PGP2 below (input) +! FSPGL_PROC - external procedure to be executed in fourier space +! before transposition +! LDSCDERS - indicating if derivatives of scalar variables are req. +! LDVORGP - indicating if grid-point vorticity is req. +! LDDIVGP - indicating if grid-point divergence is req. +! LDUVDER - indicating if E-W derivatives of u and v are req. +! KPROMA - required blocking factor for gridpoint output +! KVSETUV(:) - indicating which 'b-set' in spectral space owns a +! vor/div field. Equivalant to NBSETLEV in the IFS. +! The length of KVSETUV should be the GLOBAL number +! of u/v fields which is the dimension of u and v releated +! fields in grid-point space. +! KVESETSC(:) - indicating which 'b-set' in spectral space owns a +! scalar field. As for KVSETUV this argument is required +! if the total number of processors is greater than +! the number of processors used for distribution in +! spectral wave space. +! KVSETSC3A(:) - as KVESETSC for PSPSC3A (distribution on first dimension) +! KVSETSC3B(:) - as KVESETSC for PSPSC3B (distribution on first dimension) +! KVSETSC2(:) - as KVESETSC for PSPSC2 (distribution on first dimension) +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PGP(:,:,:) - gridpoint fields (output) +! PGP need to dimensioned (NPROMA,IF_GP,NGPBLKS) where +! NPROMA is the blocking factor, IF_GP the total number +! of output fields and NGPBLKS the number of NPROMA blocks. +! The ordering of the output fields is as follows (all +! parts are optional depending on the input switches): +! +! vorticity : IF_UV_G fields (if psvor present and LDVORGP) +! divergence : IF_UV_G fields (if psvor present and LDDIVGP) +! u : IF_UV_G fields (if psvor present) +! v : IF_UV_G fields (if psvor present) +! scalar fields : IF_SCALARS_G fields (if pspscalar present) +! N-S derivative of scalar fields : IF_SCALARS_G fields (if pspscalar +! present and LDSCDERS) +! E-W derivative of u : IF_UV_G fields (if psvor present and and LDUVDER) +! E-W derivative of v : IF_UV_G fields (if psvor present and and LDUVDER) +! E-W derivative of scalar fields : IF_SCALARS_G fields (if pspscalar +! present and LDSCDERS) +! +! Here IF_UV_G is the GLOBAL number of u/v fields as given by the length +! of KVSETUV (or by PSPVOR if no split in spectral 'b-set' direction +! IF_SCALARS_G is the GLOBAL number of scalar fields as giben by the +! length of KVESETSC (or by number of fields in PSPSCALAR if no spectral +! 'b-set' split + +! As an alternative to using PGP you can also use a combination of the +! following arrays. The reason for introducing these alternative ways +! of calling INV_TRANS is to avoid uneccessary copies where your data +! structures don't fit in to the 'PSPVOR,PSPDIV, PSPSCALAR, PGP' layout. +! The use of any of these precludes the use of PGP and vice versa. +! +! PGPUV(:,:,:,:) - the 'u-v' related grid-point variables in the order +! described for PGP. The second dimension of PGPUV should +! be the same as the "global" first dimension of +! PSPVOR,PSPDIV (in the IFS this is the number of levels) +! PGPUV need to be dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (u,v,vor,div ...) +! PGP3A(:,:,:,:) - grid-point array directly connected with PSPSC3A +! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC3A if no derivatives, 3 times that with der.) +! PGP3B(:,:,:,:) - grid-point array directly connected with PSPSC3B +! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC3B if no derivatives, 3 times that with der.) +! PGP2(:,:,:) - grid-point array directly connected with PSPSC2 +! dimensioned(NPROMA,IFLDS,NGPBLKS) +! IFLDS is the number of 'variables' (the same as in +! PSPSC2 if no derivatives, 3 times that with der.) + +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- LTDIR_CTLAD - control of Legendre transform +! FTDIR_CTLAD - control of Fourier transform + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) +LOGICAL ,OPTIONAL, INTENT(IN) :: LDSCDERS +LOGICAL ,OPTIONAL, INTENT(IN) :: LDVORGP +LOGICAL ,OPTIONAL, INTENT(IN) :: LDDIVGP +LOGICAL ,OPTIONAL, INTENT(IN) :: LDUVDER +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +EXTERNAL FSPGL_PROC +OPTIONAL FSPGL_PROC +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP(:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGPUV(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP3A(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP3B(:,:,:,:) +REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP2(:,:,:) + +!ifndef INTERFACE + +CALL ABORT_TRANS("INV_TRANSAD: NOT SUPPORTED FOR GPU") + +END SUBROUTINE INV_TRANSAD + diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 new file mode 100755 index 00000000..c9a0055f --- /dev/null +++ b/src/trans/gpu/external/setup_trans.F90 @@ -0,0 +1,705 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& + & KTMAX,KRESOL,PWEIGHT,LDGRIDONLY,LDUSERPNM,LDKEEPRPNM,LDUSEFLT,& + & LDSPSETUPONLY,LDPNMONLY,LDUSEFFTW,& + & LDLL,LDSHIFTLL,CDIO_LEGPOL,CDLEGPOLFNAME,KLEGPOLPTR,KLEGPOLPTR_LEN) + +!**** *SETUP_TRANS* - Setup transform package for specific resolution + +! Purpose. +! -------- +! To setup for making spectral transforms. Each call to this routine +! creates a new resolution up to a maximum of NMAX_RESOL set up in +! SETUP_TRANS0. You need to call SETUP_TRANS0 before this routine can +! be called. + +!** Interface. +! ---------- +! CALL SETUP_TRANS(...) + +! Explicit arguments : KLOEN,LDSPLIT are optional arguments +! -------------------- +! KSMAX - spectral truncation required +! KDGL - number of Gaussian latitudes +! KDLON - number of points on each Gaussian latitude [2*KDGL] +! KLOEN(:) - number of points on each Gaussian latitude [2*KDGL] +! LDSPLIT - true if split latitudes in grid-point space [false] +! KTMAX - truncation order for tendencies? +! KRESOL - the resolution identifier +! PWEIGHT - the weight per grid-point (for a weighted distribution) +! LDGRIDONLY - true if only grid space is required + +! KSMAX,KDGL,KTMAX and KLOEN are GLOBAL variables desribing the resolution +! in spectral and grid-point space + +! LDSPLIT describe the distribution among processors of grid-point data and +! has no relevance if you are using a single processor + +! PSTRET - stretching factor - for the case the Legendre polynomials are +! computed on the stretched sphere - works with LSOUTHPNM +! LDUSEFLT - use Fast Legandre Transform (Butterfly algorithm) +! LDUSERPNM - Use Belusov algorithm to compute legendre pol. (else new alg.) +! LDKEEPRPNM - Keep Legendre Polynomials (only applicable when using +! FLT, otherwise always kept) +! LDPNMONLY - Compute the Legendre polynomials only, not the FFTs. +! LDUSEFFTW - Use FFTW for FFTs +! LDLL - Setup second set of input/output latitudes +! the number of input/output latitudes to transform is equal KDGL +! or KDGL+2 in the case that includes poles + equator +! the number of input/output longitudes to transform is 2*KDGL +! LDSHIFTLL - Shift output lon/lat data by 0.5*dx and 0.5*dy +! CDIO_LEGPOL - IO option on Legendre polinomials : N.B. Only works for NPROC=1 +! Options: +! 'READF' - read Leg.Pol. from file CDLEGPOLFNAME +! 'WRITEF' - write Leg.Pol. to file CDLEGPOLFNAME +! 'MEMBUF' - Leg. Pol provided in shared memory segment pointed to by KLEGPOLPTR of +! length KLEGPOLPTR_LEN +! CDLEGPOLFNAME - file name for Leg.Pol. IO +! KLEGPOLPTR - pointer to Legendre polynomials memory segment +! KLEGPOLPTR_LEN - length of Legendre polynomials memory segment + +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- SETUP_DIMS - setup distribution independent dimensions +! SUMP_TRANS_PRELEG - first part of setup of distr. environment +! SULEG - Compute Legandre polonomial and Gaussian +! Latitudes and Weights +! SUMP_TRANS - Second part of setup of distributed environment +! SUFFT - setup for FFT +! SHAREDMEM_CREATE - create memory buffer for Leg.pol. + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 +! Daan Degrauwe : Mar 2012 E'-zone dimensions +! R. El Khatib 09-Aug-2012 %LAM in GEOM_TYPE +! R. El Khatib 14-Jun-2013 PSTRET, LDPNMONLY, LENABLED +! G. Mozdzynski : Oct 2014 Support f +! N. Wedi : Apr 2015 Support dual set of lat/lon +! G. Mozdzynski : Jun 2015 Support alternative FFTs to FFTW +! M.Hamrud/W.Deconinck : July 2015 IO options for Legenndre polynomials +! R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB , JPRD +USE PARKIND_ECTRANS ,ONLY : JPRBT +USE EC_ENV_MOD ,ONLY : EC_GETENV +USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT,C_ASSOCIATED,C_SIZE_T + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NOUT, MSETUP0, NCUR_RESOL, NDEF_RESOL, & + & NMAX_RESOL, NPRINTLEV, LENABLED, NERR +USE TPM_DIM ,ONLY : R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL +USE TPM_DISTR ,ONLY : D, DISTR_RESOL,NPROC,NPRTRV, D_NUMP,D_NDGL_FS,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1, D_NASM0, & +& D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,MYSETV,MYSETW, MYPROC,D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 +USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX +USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F,F_RW, F_RN, F_RLAPIN, F_RACTHE, ZEPSNM, & +& ZAA,ZAS,& +& ZAA0,ZAS0,KMLOC0 +! IZBA,IZCAT +USE TPM_FFT ,ONLY : T, FFT_RESOL +USE TPM_HICFFT ,ONLY : HICT, HICFFT_RESOL + +#ifdef WITH_FFTW +USE TPM_FFTW ,ONLY : TW, FFTW_RESOL +#endif +USE TPM_FLT +USE TPM_CTL + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE SETUP_DIMS_MOD ,ONLY : SETUP_DIMS +USE SUMP_TRANS_MOD ,ONLY : SUMP_TRANS +USE SUMP_TRANS_PRELEG_MOD ,ONLY : SUMP_TRANS_PRELEG +USE SULEG_MOD ,ONLY : SULEG +USE PRE_SULEG_MOD ,ONLY : PRE_SULEG +USE SUFFT_MOD ,ONLY : SUFFT +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE SHAREDMEM_MOD ,ONLY : SHAREDMEM_CREATE +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +!USE CUDA_DEVICE_MOD ! only for NVIDIA GPUs +USE PREPSNM_MOD ,ONLY : PREPSNM +#ifdef ACCGPU +USE OPENACC +#endif +#ifdef OMPGPU +USE OMP_LIB +#endif + +!endif INTERFACE + +IMPLICIT NONE + +! Dummy arguments + +INTEGER(KIND=JPIM) ,INTENT(IN) :: KSMAX,KDGL +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KDLON +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KLOEN(:) +LOGICAL ,OPTIONAL,INTENT(IN) :: LDSPLIT +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KTMAX +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT):: KRESOL +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PWEIGHT(:) +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PSTRET +LOGICAL ,OPTIONAL,INTENT(IN):: LDGRIDONLY +LOGICAL ,OPTIONAL,INTENT(IN):: LDUSEFLT +LOGICAL ,OPTIONAL,INTENT(IN):: LDUSERPNM +LOGICAL ,OPTIONAL,INTENT(IN):: LDKEEPRPNM +LOGICAL ,OPTIONAL,INTENT(IN):: LDSPSETUPONLY +LOGICAL ,OPTIONAL,INTENT(IN):: LDPNMONLY +LOGICAL ,OPTIONAL,INTENT(IN):: LDUSEFFTW +LOGICAL ,OPTIONAL,INTENT(IN):: LDLL +LOGICAL ,OPTIONAL,INTENT(IN):: LDSHIFTLL +CHARACTER(LEN=*),OPTIONAL,INTENT(IN):: CDIO_LEGPOL +CHARACTER(LEN=*),OPTIONAL,INTENT(IN):: CDLEGPOLFNAME +TYPE(C_PTR) ,OPTIONAL,INTENT(IN) :: KLEGPOLPTR +INTEGER(C_SIZE_T) ,OPTIONAL,INTENT(IN) :: KLEGPOLPTR_LEN + +!ifndef INTERFACE + +! Local variables +INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL +INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, I, J + +INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD + +LOGICAL :: LLP1,LLP2, LLSPSETUPONLY +REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +CHARACTER(LEN=8) :: CENV + +#ifdef ACCGPU +INTEGER(ACC_DEVICE_KIND) :: IDEVTYPE +#endif +INTEGER :: INUMDEVS, IUNIT, ISTAT, IDEV, MYGPU + +#include "user_clock.intfb.h" +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',0,ZHOOK_HANDLE) + +IF(MSETUP0 == 0) THEN + CALL ABORT_TRANS('SETUP_TRANS: SETUP_TRANS0 HAS TO BE CALLED BEFORE SETUP_TRANS') +ENDIF +LLP1 = NPRINTLEV>0 +LLP2 = NPRINTLEV>1 +IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SETUP_TRANS ===' + +! Allocate resolution dependent structures +IF(.NOT. ALLOCATED(DIM_RESOL)) THEN + IDEF_RESOL = 1 + ALLOCATE(DIM_RESOL(NMAX_RESOL)) + ALLOCATE(FIELDS_RESOL(NMAX_RESOL)) + ALLOCATE(GEOM_RESOL(NMAX_RESOL)) + ALLOCATE(DISTR_RESOL(NMAX_RESOL)) + ALLOCATE(FFT_RESOL(NMAX_RESOL)) + ALLOCATE(HICFFT_RESOL(NMAX_RESOL)) + ALLOCATE(FLT_RESOL(NMAX_RESOL)) + ALLOCATE(CTL_RESOL(NMAX_RESOL)) + GEOM_RESOL(:)%LAM=.FALSE. + ALLOCATE(LENABLED(NMAX_RESOL)) + LENABLED(:)=.FALSE. +ELSE + IDEF_RESOL = NMAX_RESOL+1 + DO JRES=1,NMAX_RESOL + IF(.NOT.LENABLED(JRES)) THEN + IDEF_RESOL = JRES + EXIT + ENDIF + ENDDO + IF(IDEF_RESOL > NMAX_RESOL) THEN + CALL ABORT_TRANS('SETUP_TRANS:IDEF_RESOL > NMAX_RESOL') + ENDIF +ENDIF + +IF (PRESENT(KRESOL)) THEN + KRESOL=IDEF_RESOL +ENDIF + +! Point at structures due to be initialized +CALL SET_RESOL(IDEF_RESOL,LDSETUP=.TRUE.) + +IF(LLP1) WRITE(NOUT,*) '=== DEFINING RESOLUTION ',NCUR_RESOL + + + +! Defaults for optional arguments + + +G%LREDUCED_GRID = .FALSE. +G%RSTRET=1.0_JPRBT +D%LGRIDONLY = .FALSE. +D%LSPLIT = .FALSE. +D%LCPNMONLY=.FALSE. +S%LUSE_BELUSOV=.TRUE. ! use Belusov algorithm to compute RPNM array instead of per m +S%LKEEPRPNM=.FALSE. ! Keep Legendre polonomials (RPNM) +LLSPSETUPONLY = .FALSE. ! Only create distributed spectral setup +S%LDLL = .FALSE. ! use mapping to/from second set of latitudes +S%LSHIFTLL = .FALSE. ! shift output lat-lon by 0.5dx, 0.5dy +C%LREAD_LEGPOL = .FALSE. +C%LWRITE_LEGPOL = .FALSE. + + +! NON-OPTIONAL ARGUMENTS +R%NSMAX = KSMAX +R%NDGL = KDGL +! E'-defaults +R%NNOEXTZL=0 +R%NNOEXTZG=0 + +! IMPLICIT argument : +G%LAM = .FALSE. + +IF(PRESENT(KDLON)) THEN + R%NDLON = KDLON +ELSE + R%NDLON = 2*R%NDGL +ENDIF + +IF(PRESENT(LDLL)) THEN + S%LDLL=LDLL + IF( LDLL ) THEN + S%NDLON=R%NDLON + ! account for pole + equator + R%NDGL=R%NDGL+2 + IF(PRESENT(LDSHIFTLL)) THEN + S%LSHIFTLL = LDSHIFTLL + ! geophysical (shifted) lat-lon without pole and equator + IF(S%LSHIFTLL) R%NDGL=R%NDGL-2 + ENDIF + S%NDGL=R%NDGL + ENDIF +ENDIF + +IF (R%NDGL <= 0 .OR. MOD(R%NDGL,2) /= 0) THEN + CALL ABORT_TRANS ('SETUP_TRANS: KDGL IS NOT A POSITIVE, EVEN NUMBER') +ENDIF + +! Optional arguments + +ALLOCATE(G%NLOEN(R%NDGL)) +IF(LLP2)WRITE(NOUT,9) 'NLOEN ',SIZE(G%NLOEN ),SHAPE(G%NLOEN ) +IF(PRESENT(KLOEN)) THEN + IF( MINVAL(KLOEN(:)) <= 0 )THEN + CALL ABORT_TRANS ('SETUP_TRANS: KLOEN INVALID (ONE or MORE POINTS <= 0)') + ENDIF + R%NDLON=MAXVAL(KLOEN(:)) + DO JGL=1,R%NDGL + IF(KLOEN(JGL) /= R%NDLON) THEN + G%LREDUCED_GRID = .TRUE. + EXIT + ENDIF + ENDDO +ENDIF + +IF (G%LREDUCED_GRID) THEN + G%NLOEN(:) = KLOEN(1:R%NDGL) +ELSE + G%NLOEN(:) = R%NDLON +ENDIF + +IF(PRESENT(LDSPLIT)) THEN + D%LSPLIT = LDSPLIT +ENDIF + +IF(PRESENT(KTMAX)) THEN + R%NTMAX = KTMAX +ELSE + R%NTMAX = R%NSMAX +ENDIF + +IF(PRESENT(PWEIGHT)) THEN + D%LWEIGHTED_DISTR = .TRUE. + IF( D%LWEIGHTED_DISTR .AND. .NOT.D%LSPLIT )THEN + CALL ABORT_TRANS('SETUP_TRANS: LWEIGHTED_DISTR=T AND LSPLIT=F NOT SUPPORTED') + ENDIF + IF(SIZE(PWEIGHT) /= SUM(G%NLOEN(:)) )THEN + CALL ABORT_TRANS('SETUP_TRANS:SIZE(PWEIGHT) /= SUM(G%NLOEN(:))') + ENDIF + IF( MINVAL(PWEIGHT(:)) < 0.0_JPRBT )THEN + CALL ABORT_TRANS('SETUP_TRANS: INVALID WEIGHTS') + ENDIF + ALLOCATE(D%RWEIGHT(SIZE(PWEIGHT))) + D%RWEIGHT(:)=PWEIGHT(:) +ELSE + D%LWEIGHTED_DISTR = .FALSE. +ENDIF + +IF(PRESENT(LDGRIDONLY)) THEN + D%LGRIDONLY=LDGRIDONLY +ENDIF + +IF(PRESENT(LDSPSETUPONLY)) THEN + LLSPSETUPONLY=LDSPSETUPONLY +ENDIF + +IF(PRESENT(LDPNMONLY)) THEN + D%LCPNMONLY=LDPNMONLY +ENDIF + + +S%LSOUTHPNM=.FALSE. +IF(PRESENT(PSTRET)) THEN + IF (ABS(PSTRET-1.0_JPRBT)>100._JPRBT*EPSILON(1._JPRBT)) THEN + G%RSTRET=PSTRET + S%LSOUTHPNM=.TRUE. + ENDIF +ENDIF + +IF(PRESENT(CDIO_LEGPOL)) THEN + IF(NPROC > 1) CALL ABORT_TRANS('SETUP_TRANS:CDIO_LEGPOL OPTIONS ONLY FOR NPROC=1 ') + IF(TRIM(CDIO_LEGPOL) == 'readf' .OR. TRIM(CDIO_LEGPOL) == 'READF' ) THEN + IF(.NOT.PRESENT(CDLEGPOLFNAME)) CALL ABORT_TRANS('SETUP_TRANS: CDLEGPOLFNAME ARGUMENT MISSING') + C%LREAD_LEGPOL = .TRUE. + C%CLEGPOLFNAME = TRIM(CDLEGPOLFNAME) + C%CIO_TYPE='file' + ELSEIF(TRIM(CDIO_LEGPOL) == 'writef' .OR. TRIM(CDIO_LEGPOL) == 'WRITEF') THEN + IF(.NOT.PRESENT(CDLEGPOLFNAME)) CALL ABORT_TRANS('SETUP_TRANS: CDLEGPOLFNAME ARGUMENT MISSING') + C%LWRITE_LEGPOL = .TRUE. + C%CLEGPOLFNAME = TRIM(CDLEGPOLFNAME) + C%CIO_TYPE='file' + ELSEIF(TRIM(CDIO_LEGPOL) == 'membuf' .OR. TRIM(CDIO_LEGPOL) == 'MEMBUF') THEN + IF(.NOT.PRESENT(KLEGPOLPTR)) CALL ABORT_TRANS('SETUP_TRANS: KLEGPOLPTR ARGUMENT MISSING') + IF(.NOT.C_ASSOCIATED(KLEGPOLPTR)) CALL ABORT_TRANS('SETUP_TRANS: KLEGPOLPTR NULL POINTER') + IF(.NOT.PRESENT(KLEGPOLPTR_LEN)) CALL ABORT_TRANS('SETUP_TRANS: KLEGPOLPTR_LEN ARGUMENT MISSING') + C%LREAD_LEGPOL = .TRUE. + C%CIO_TYPE='mbuf' + CALL SHAREDMEM_CREATE( C%STORAGE,KLEGPOLPTR,KLEGPOLPTR_LEN) + ELSE + WRITE(NERR,*) 'CDIO_LEGPOL ', TRIM(CDIO_LEGPOL) + CALL ABORT_TRANS('SETUP_TRANS:CDIO_LEGPOL UNKNOWN METHOD ') + ENDIF +ENDIF + +IF(PRESENT(LDUSEFLT)) THEN + IF (LDUSEFLT) THEN + CALL ABORT_TRANS('SETUP_TRANS: LDUSEFLT option is not supported for GPU') + ENDIF +ENDIF +IF(PRESENT(LDUSERPNM)) THEN + S%LUSE_BELUSOV=LDUSERPNM +ENDIF +IF(PRESENT(LDKEEPRPNM)) THEN + S%LKEEPRPNM=LDKEEPRPNM +ENDIF +! Setup resolution dependent structures +! ------------------------------------- + +! Setup distribution independent dimensions +CALL SETUP_DIMS + +! First part of setup of distributed environment +CALL SUMP_TRANS_PRELEG + +IF( .NOT.LLSPSETUPONLY ) THEN + +! Compute Legendre polonomial and Gaussian Latitudes and Weights + CALL SULEG + +! Second part of setup of distributed environment + CALL SUMP_TRANS + CALL GSTATS(1802,0) + +! Initialize Fast Fourier Transform package + IF (.NOT.D%LCPNMONLY) CALL SUFFT + CALL GSTATS(1802,1) +ELSE + CALL PRE_SULEG +ENDIF + +! Signal the current resolution is active +LENABLED(IDEF_RESOL)=.TRUE. +NDEF_RESOL = COUNT(LENABLED) + +IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',1,ZHOOK_HANDLE) +! ------------------------------------------------------------------ +9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) + +IF( .NOT.D%LGRIDONLY ) THEN + +!allocating arrays for the GPU: +!! CALL EC_GETENV("ECTRANS_GPU_NFLEV",CENV) +!! IF(LEN_TRIM(CENV)>0) THEN +!! WRITE(NOUT,'(2A)') "Using temporary solution for buffer allocation using ${ECTRANS_GPU_NFLEV}=",CENV +!! READ(CENV,*) NFLEV0 +!! ELSE +!! NFLEV0 = ceiling(REAL(IMAXFLD)/NPRTRV) +!! ENDIF + +IUNIT=300+MYPROC + +#ifdef ACCGPU +!!IDEVTYPE=ACC_DEVICE_NVIDIA +IDEVTYPE=ACC_GET_DEVICE_TYPE() +INUMDEVS = ACC_GET_NUM_DEVICES(IDEVTYPE) +MYGPU = MOD(MYPROC-1,INUMDEVS) +CALL ACC_SET_DEVICE_NUM(MYGPU, IDEVTYPE) +MYGPU = ACC_GET_DEVICE_NUM(IDEVTYPE) +!ISTAT = CUDA_GETDEVICE(IDEV) +#endif + +print*,'R%NTMAX=',R%NTMAX +print*,'R%NSMAX=',R%NSMAX + +#ifdef ACCGPU +!$ACC ENTER DATA COPYIN(F,S,D,R,G) +!$ACC ENTER DATA & +!$ACC& COPYIN(F%RN,F%RLAPIN) & +!$ACC& COPYIN(S%FA,S%ITHRESHOLD) & +!$ACC& COPYIN(D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM,D%NPTRLS,D%MSTABF) & +!$ACC& COPYIN(R%NDGNH,R%NSMAX) & +!$ACC& COPYIN(G%NDGLU,G%NMEN,G%NLOEN) + +#endif +#ifdef OMPGPU +!$OMP TARGET ENTER DATA MAP(ALLOC:ZAA,ZAS) +!$OMP TARGET ENTER DATA MAP(TO:F,S,D,D_NUMP,D_MYMS,R,R_NDGNH,R_NSMAX,G,G_NDGLU) +!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB0,D_NPNTGTB1,D_NSTAGT0B,D_NSTAGT1B,D_NSTAGTF,G_NMEN,D_NPROCM,D_NPTRLS,G,G_NLOEN,D_MSTABF) +#endif + +! Initialize A arrays + +ALLOCATE(ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) +ALLOCATE(ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) + +WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP +WRITE(NOUT,*)'ZAS:',size(ZAS) +WRITE(NOUT,*)'ZAA:',size(ZAA) + +ZAA(:,:,:) = 0._JPRBT +ZAS(:,:,:) = 0._JPRBT + +DO JMLOC=1,D%NUMP + KM = D%MYMS(JMLOC) + KDGLU = G%NDGLU(KM) + ILA = (R%NSMAX-KM+2)/2 + ILS = (R%NSMAX-KM+3)/2 + + ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) +ENDDO + +! permanent copy of Legendre polynomials into device + +#ifdef ACCGPU +!$ACC ENTER DATA COPYIN(ZAA,ZAS) +#endif +#ifdef OMPGPU +#endif + +ALLOCATE(ZEPSNM(D%NUMP,0:R%NTMAX+2)) +WRITE(NOUT,*)'ZEPSNM :',SIZE(ZEPSNM) + +ZEPSNM = 0._JPRBT +! on the host +CALL PREPSNM +#ifdef ACCGPU +!$ACC ENTER DATA COPYIN(ZEPSNM) +#endif + +! TODO: I guess tose might be needed again +! add arrays for GPNORM1 +!ALLOCATE(ZAVE(IF_FS,R%NDGL)) +!ALLOCATE(ZMINGL(IF_FS,R%NDGL)) +!ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) +!ALLOCATE(ZMINGPN(IF_FS)) +!ALLOCATE(ZMAXGPN(IF_FS)) +! +!ZAVE = 0._JPRBT +!ZMINGL = 0._JPRBT +!ZMAXGL = 0._JPRBT +!ZMINGPN = 0._JPRBT +!ZMAXGPN = 0._JPRBT +!#ifdef ACCGPU +!!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) +!#endif + +!set up flat copies of constant data +R_NSMAX=R%NSMAX +R_NTMAX=R%NTMAX +R_NDGNH=R%NDGNH +R_NDGL=R%NDGL + + +ALLOCATE(D_NSTAGT0B(SIZE(D%NSTAGT0B))) +ALLOCATE(D_NSTAGT1B(SIZE(D%NSTAGT1B))) +ALLOCATE(D_NPNTGTB0(0:SIZE(D%NPNTGTB0,1)-1,SIZE(D%NPNTGTB0,2))) +ALLOCATE(D_NPNTGTB1(SIZE(D%NPNTGTB1,1),SIZE(D%NPNTGTB1,2))) +ALLOCATE(D_MYMS(SIZE(D%MYMS))) +ALLOCATE(D_NPROCL(SIZE(D%NPROCL))) +ALLOCATE(D_NASM0(0:SIZE(D%NASM0)-1)) +ALLOCATE(D_NSTAGTF(SIZE(D%NSTAGTF))) +ALLOCATE(D_MSTABF(SIZE(D%MSTABF))) +ALLOCATE(D_NPROCM(0:SIZE(D%NPROCM)-1)) +ALLOCATE(D_NPTRLS(SIZE(D%NPTRLS))) + +ALLOCATE(G_NDGLU(0:SIZE(G%NDGLU)-1)) +ALLOCATE(G_NMEN(SIZE(G%NMEN))) +ALLOCATE(G_NLOEN(SIZE(G%NLOEN))) + +ALLOCATE(F_RW(SIZE(F%RW))) +ALLOCATE(F_RN(-1:SIZE(F%RN)-2)) +ALLOCATE(F_RLAPIN(-1:SIZE(F%RLAPIN)-2)) +ALLOCATE(F_RACTHE(SIZE(F%RACTHE))) + + +DO I=0,SIZE(G%NDGLU)-1 + G_NDGLU(I)=G%NDGLU(I) +END DO + +G_NMEN_MAX=0 +DO I=1,SIZE(G%NMEN) + G_NMEN(I)=G%NMEN(I) + IF (G_NMEN(I) .GT. G_NMEN_MAX) G_NMEN_MAX=G_NMEN(I) +END DO + +G_NLOEN_MAX=0 +DO I=1,SIZE(G%NLOEN) + G_NLOEN(I)=G%NLOEN(I) + IF (G_NLOEN(I) .GT. G_NLOEN_MAX) G_NLOEN_MAX=G_NLOEN(I) +END DO + +DO I=1,SIZE(D%NSTAGT0B) + D_NSTAGT0B(I)=D%NSTAGT0B(I) +END DO + +DO I=1,SIZE(D%NSTAGT1B) + D_NSTAGT1B(I)=D%NSTAGT1B(I) +END DO + +DO I=1,SIZE(D%NPROCL) + D_NPROCL(I)=D%NPROCL(I) +END DO + +DO I=0,SIZE(D%NASM0)-1 + D_NASM0(I)=D%NASM0(I) +END DO + +DO I=1,SIZE(D%NSTAGTF) + D_NSTAGTF(I)=D%NSTAGTF(I) +END DO + +DO I=1,SIZE(D%MSTABF) + D_MSTABF(I)=D%MSTABF(I) +END DO + +DO I=0,SIZE(D%NPROCM)-1 + D_NPROCM(I)=D%NPROCM(I) +END DO + +DO I=1,SIZE(D%NPTRLS) + D_NPTRLS(I)=D%NPTRLS(I) +END DO + +DO I=1,SIZE(D%NPNTGTB0,2) + DO J=0,SIZE(D%NPNTGTB0,1)-1 + D_NPNTGTB0(J,I)=D%NPNTGTB0(J,I) + END DO +END DO + +DO I=1,SIZE(D%NPNTGTB1,2) + DO J=1,SIZE(D%NPNTGTB1,1) + D_NPNTGTB1(J,I)=D%NPNTGTB1(J,I) + END DO +END DO + +D_OFFSETS_GEMM1 => D%OFFSETS_GEMM1 +D_OFFSETS_GEMM2 => D%OFFSETS_GEMM2 +#ifdef OMPGPU +#endif +#ifdef ACCGPU +!$ACC ENTER DATA COPYIN(D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) +#endif + +D_NUMP=D%NUMP +D_NDGL_FS=D%NDGL_FS + +KMLOC0 = -1 +DO I=1,SIZE(D%MYMS) + D_MYMS(I)=D%MYMS(I) + IF(D_MYMS(I) == 0) KMLOC0 = I +end DO + +! arrays for m=0 in ledir_mod: +IF(KMLOC0 >= 0) THEN + ALLOCATE(ZAA0(SIZE(ZAA,1),SIZE(ZAA,2))) + ALLOCATE(ZAS0(SIZE(ZAS,1),SIZE(ZAS,2))) + ZAA0 = ZAA(:,:,KMLOC0) + ZAS0 = ZAS(:,:,KMLOC0) +#ifdef ACCGPU + !$ACC ENTER DATA COPYIN(ZAA0,ZAS0) +#endif +#ifdef OMPGPU + !$OMP TARGET ENTER DATA MAP(TO:ZAA0,ZAS0) +#endif + WRITE(NOUT,*) 'GPU arrays for m=0 successfully allocated' +#ifdef ACCGPU + WRITE(NOUT,*) 'Using OpenACC' +#endif +#ifdef OMPGPU + WRITE(NOUT,*) 'Using OpenMP offloading' +#endif +ENDIF + +DO I=1,SIZE(F%RW) + F_RW(I)=F%RW(I) +END DO +DO I=-1,SIZE(F%RLAPIN)-2 + F_RLAPIN(I)=F%RLAPIN(I) +END DO +DO I=1,SIZE(F%RACTHE) + F_RACTHE(I)=F%RACTHE(I) +END DO +DO I=-1,SIZE(F%RN)-2 + F_RN(I)=F%RN(I) +END DO + +#ifdef ACCGPU +!$ACC ENTER DATA COPYIN(R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,& +!$ACC& D_NPNTGTB1,D_NPROCL,D_NUMP,D_NDGL_FS,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF,& +!$ACC& D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,& +!$ACC& G_NLOEN_MAX,F_RW,F_RLAPIN,F_RN,F_RACTHE) +#endif +#ifdef OMPGPU +!$OMP TARGET ENTER DATA MAP(TO:R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B) +!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB1,D_NPROCL,D_NUMP,D_NDGL_FS,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF) +!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN) +!$OMP TARGET ENTER DATA MAP(TO:G_NLOEN_MAX,F_RW,F_RLAPIN,F_RN,F_RACTHE) +#endif + +WRITE(NOUT,*) '===GPU arrays successfully allocated' +#ifdef ACCGPU +!$ACC wait +#endif +#ifdef OMPGPU +!$OMP BARRIER +#endif + +! free memory +!DO JMLOC=1,D%NUMP +! DEALLOCATE(S%FA(JMLOC)%RPNMA) +! DEALLOCATE(S%FA(JMLOC)%RPNMS) +!ENDDO + +!endif INTERFACE + +ENDIF + +END SUBROUTINE SETUP_TRANS diff --git a/src/trans/gpu/external/setup_trans0.F90 b/src/trans/gpu/external/setup_trans0.F90 new file mode 100755 index 00000000..efa5b87b --- /dev/null +++ b/src/trans/gpu/external/setup_trans0.F90 @@ -0,0 +1,300 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE SETUP_TRANS0(KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR,& +& KPRGPNS,KPRGPEW,KPRTRW,KCOMBFLEN,& +& LDMPOFF,LDSYNC_TRANS,KTRANS_SYNC_LEVEL,& +& LDEQ_REGIONS,K_REGIONS_NS,K_REGIONS_EW,K_REGIONS,& +& PRAD,LDALLOPERM,KOPT_MEMORY_TR) + +!**** *SETUP_TRANS0* - General setup routine for transform package + +! Purpose. +! -------- +! Resolution independent part of setup of transform package +! Has to be called BEFORE SETUP_TRANS + +!** Interface. +! ---------- +! CALL SETUP_TRANS0(...) + +! Explicit arguments : All arguments are optional, [..] default value +! ------------------- +! KOUT - Unit number for listing output [6] +! KERR - Unit number for error messages [0] +! KPRINTLEV - level of output to KOUT, 0->no output,1->normal,2->debug [0] +! KMAX_RESOL - maximum number of different resolutions for this run [1] +! KPRGPNS - splitting level in N-S direction in grid-point space [1] +! KPRGPEW - splitting level in E-W direction in grid-point space [1] +! KPRTRW - splitting level in wave direction in spectral space [1] +! KCOMBFLEN - Size of communication buffer [1800000 (*8bytes) ] +! LDMPOFF - switch off message passing [false] +! LDSYNC_TRANS - switch to activate barriers in trmtol trltom [false] +! KTRANS_SYNC_LEVEL - use of synchronization/blocking [0] +! LDEQ_REGIONS - true if new eq_regions partitioning [false] +! K_REGIONS - Number of regions (1D or 2D partitioning) +! K_REGIONS_NS - Maximum number of NS partitions +! K_REGIONS_EW - Maximum number of EW partitions +! PRAD - Radius of the planet +! LDALLOPERM - Allocate certain arrays permanently +! KOPT_MEMORY_TR - memory strategy (stack vs heap) in gripoint transpositions + +! The total number of (MPI)-processors has to be equal to KPRGPNS*KPRGPEW + +! Method. +! ------- + +! Externals. SUMP_TRANS0 - initial setup routine +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 +! R. El Khatib 03-01-24 LDMPOFF +! G. Mozdzynski 2006-09-13 LDEQ_REGIONS +! N. Wedi 2009-11-30 add radius +! R. El Khatib 09-Sep-2020 NSTACK_MEMORY_TR + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB, JPRD + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR, NOUT, LMPOFF, LSYNC_TRANS, NTRANS_SYNC_LEVEL, MSETUP0, & + & NMAX_RESOL, NPRINTLEV, NPROMATR, LALLOPERM +USE TPM_DISTR ,ONLY : LEQ_REGIONS, NCOMBFLEN, NPRGPEW,NPRGPNS, NPRTRW, NPRTRV, MYSETV +USE TPM_CONSTANTS ,ONLY : RA +USE MPL_MODULE + +USE SUMP_TRANS0_MOD ,ONLY : SUMP_TRANS0 +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_EW, N_REGIONS_NS +USE ECTRANS_VERSION_MOD ,ONLY : ECTRANS_VERSION_STR, ECTRANS_GIT_SHA1 +USE EC_ENV_MOD ,ONLY : EC_GETENV +#ifdef _OPENACC +USE OPENACC +#endif + +!endif INTERFACE + +IMPLICIT NONE + +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KOUT,KERR,KPRINTLEV,KMAX_RESOL,KPROMATR +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KPRGPNS,KPRGPEW,KPRTRW,KCOMBFLEN +LOGICAL ,OPTIONAL,INTENT(IN) :: LDMPOFF +LOGICAL ,OPTIONAL,INTENT(IN) :: LDSYNC_TRANS +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KTRANS_SYNC_LEVEL +LOGICAL ,OPTIONAL,INTENT(IN) :: LDEQ_REGIONS +LOGICAL ,OPTIONAL,INTENT(IN) :: LDALLOPERM +REAL(KIND=JPRD) ,OPTIONAL,INTENT(IN) :: PRAD +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(IN) :: KOPT_MEMORY_TR +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT) :: K_REGIONS(:) +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT) :: K_REGIONS_NS +INTEGER(KIND=JPIM) ,OPTIONAL,INTENT(OUT) :: K_REGIONS_EW + +INTEGER(KIND=JPIM) :: MYPROC +INTEGER :: IDEVICE_NUM, IPROC_PERNODE +#ifdef _OPENACC +INTEGER(ACC_DEVICE_KIND) :: IDEVTYPE, IDEVICE_TYPE +#endif +INTEGER :: NUMDEVS, IERROR, MYGPU +CHARACTER(LEN=2) :: CL_NPROC_PERNODE + +!ifndef INTERFACE + +LOGICAL :: LLP1,LLP2 + +! ------------------------------------------------------------------ + +IF( LDMPOFF ) THEN + MYPROC = 1 +ELSE + MYPROC = MPL_MYRANK() +ENDIF + + +!!CALL GSTATS_LABEL_IFS() +#ifdef _OPENACC +IDEVTYPE=ACC_GET_DEVICE_TYPE() +NUMDEVS = ACC_GET_NUM_DEVICES(IDEVTYPE) +MYGPU = MOD(MYPROC-1,NUMDEVS) +CALL ACC_SET_DEVICE_NUM(MYGPU, IDEVTYPE) +MYGPU = ACC_GET_DEVICE_NUM(IDEVTYPE) +WRITE(*,*) 'MYPROC:',MYPROC, 'GPU:', MYGPU, 'of ', NUMDEVS +#endif + +CL_NPROC_PERNODE=' ' +CALL EC_GETENV('NPROC_PERNODE',CL_NPROC_PERNODE) +IF( CL_NPROC_PERNODE /= ' ')THEN + READ(CL_NPROC_PERNODE,*) IPROC_PERNODE + IDEVICE_NUM=MOD(MYPROC-1,IPROC_PERNODE) + WRITE(0,'("TRANSFORM TEST: MYPROC=",I8," CL_NPROC_PERNODE=",A," IPROC_PERNODE=",I2,& + & " IDEVICE_NUM=",I2)') MYPROC,CL_NPROC_PERNODE,IPROC_PERNODE,IDEVICE_NUM + IDEVICE_TYPE=0 + !!CALL ACC_SET_DEVICE_NUM(IDEVICE_NUM,ACC_DEVICE_NVIDIA) + CALL ACC_SET_DEVICE_NUM(IDEVICE_NUM,IDEVTYPE) + !!CALL ACC_INIT(ACC_DEVICE_NVIDIA) + CALL ACC_INIT(IDEVTYPE) + !$OMP PARALLEL + !!CALL ACC_SET_DEVICE_NUM(IDEVICE_NUM,ACC_DEVICE_NVIDIA) + CALL ACC_SET_DEVICE_NUM(IDEVICE_NUM,IDEVTYPE) + !!CALL ACC_INIT(ACC_DEVICE_NVIDIA) + CALL ACC_INIT(IDEVTYPE) +!$OMP END PARALLEL +ENDIF + +IF(MSETUP0 /= 0) THEN +!gr CALL ABORT_TRANS('SETUP_TRANS0: SETUP_TRANS0 MAY ONLY BE CALLED ONCE') +ENDIF + +! Default values + +NOUT = 6 +NERR = 0 +NPRINTLEV = 0 +NMAX_RESOL = 1 +NPRGPNS = 1 +NPRGPEW = 1 +NPRTRW = 1 +N_REGIONS_NS=1 +N_REGIONS_EW=1 +NPROMATR = 0 +NCOMBFLEN = 1800000 +LMPOFF = .FALSE. +LSYNC_TRANS=.FALSE. +NTRANS_SYNC_LEVEL=0 +LEQ_REGIONS=.FALSE. +RA=6371229._JPRB +LALLOPERM=.FALSE. + +! Optional arguments + +IF(PRESENT(KOUT)) THEN + NOUT = KOUT +ENDIF +IF(PRESENT(KERR)) THEN + NERR = KERR +ENDIF +IF(PRESENT(KPRINTLEV)) THEN + NPRINTLEV = KPRINTLEV +ENDIF + +! Print ecTrans version information +WRITE(NOUT,'(A)') +WRITE(NOUT,'(A)') "ecTrans at version: " // ECTRANS_VERSION_STR() +WRITE(NOUT,'(A)') "commit: " // ECTRANS_GIT_SHA1() +WRITE(NOUT,'(A)') +WRITE(NOUT,'(A)') "GPU version, with following compile-time options : " +#ifdef ACCGPU + WRITE(NOUT,'(A)') " - OpenACC-based offload" +#else + WRITE(NOUT,'(A)') " - OpenMP-based offload" +#endif +#ifdef USE_GPU_AWARE_MPI + WRITE(NOUT,'(A)') " - GPU-aware MPI" +#endif +#ifdef USE_GRAPHS_GEMM + WRITE(NOUT,'(A)') " - graph-based GEMM scheduling" +#endif +#ifdef USE_CUTLASS + WRITE(NOUT,'(A)') " - Cutlass-based GEMM operations" +#endif +#ifdef USE_3XTF32 + WRITE(NOUT,'(A)') " - tensor-core usage for 32b Cutlass operations" +#endif +WRITE(NOUT,'(A)') + +LLP1 = NPRINTLEV>0 +LLP2 = NPRINTLEV>1 +IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SETUP_TRANS0 ===' + +IF(PRESENT(KMAX_RESOL))THEN + NMAX_RESOL = KMAX_RESOL +ENDIF +IF(PRESENT(KPROMATR))THEN + IF(MOD(KPROMATR,2) /= 0) THEN + CALL ABORT_TRANS('SETUP_TRANS0: KPROMATR HAS TO BE MULTIPLE OF 2') + ENDIF + NPROMATR = KPROMATR +ENDIF +IF(PRESENT(KPRGPNS)) THEN + NPRGPNS = KPRGPNS +ENDIF +IF(PRESENT(KPRGPEW)) THEN + NPRGPEW = KPRGPEW +ENDIF +IF(PRESENT(KPRTRW)) THEN + NPRTRW = KPRTRW +ENDIF +IF(PRESENT(KCOMBFLEN)) THEN + NCOMBFLEN = KCOMBFLEN +ENDIF +IF(PRESENT(LDMPOFF)) THEN + LMPOFF = LDMPOFF +ENDIF +IF(PRESENT(LDSYNC_TRANS)) THEN + LSYNC_TRANS = LDSYNC_TRANS +ENDIF +IF(PRESENT(KTRANS_SYNC_LEVEL)) THEN + NTRANS_SYNC_LEVEL = KTRANS_SYNC_LEVEL +ENDIF +IF(PRESENT(LDEQ_REGIONS)) THEN + LEQ_REGIONS = LDEQ_REGIONS +ENDIF +IF(PRESENT(KOPT_MEMORY_TR)) THEN + WRITE(NOUT,'(A)') + WRITE(NOUT,'(A)') '*** WARNING ***' + WRITE(NOUT,'(A)') 'KOPT_MEMORY_TR argument passed to SETUP_TRANS0 will be ignored' + WRITE(NOUT,'(A)') 'This option only applies to the CPU version of ecTrans' + WRITE(NOUT,'(A)') +ENDIF + +! Initial setup +CALL SUMP_TRANS0 + +IF(PRESENT(K_REGIONS_NS)) THEN + K_REGIONS_NS = N_REGIONS_NS +ENDIF + +IF(PRESENT(K_REGIONS_EW)) THEN + K_REGIONS_EW = N_REGIONS_EW +ENDIF + +IF(PRESENT(K_REGIONS)) THEN + IF(UBOUND(K_REGIONS,1) < N_REGIONS_NS) THEN + CALL ABORT_TRANS('SETUP_TRANS0: K_REGIONS TOO SMALL') + ELSE + K_REGIONS(1:N_REGIONS_NS)=N_REGIONS(1:N_REGIONS_NS) + ENDIF +ENDIF + +IF(PRESENT(PRAD)) THEN + RA=PRAD +ENDIF + +IF(PRESENT(LDALLOPERM)) THEN + LALLOPERM=LDALLOPERM +ENDIF + +! Setup level 0 complete +MSETUP0 = 1 + +! ------------------------------------------------------------------ + +!endif INTERFACE + +END SUBROUTINE SETUP_TRANS0 + + diff --git a/src/trans/gpu/external/specnorm.F90 b/src/trans/gpu/external/specnorm.F90 new file mode 100755 index 00000000..1e032da7 --- /dev/null +++ b/src/trans/gpu/external/specnorm.F90 @@ -0,0 +1,140 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + + +SUBROUTINE SPECNORM(PNORM,PSPEC,KVSET,KMASTER,KRESOL,PMET) +!**** *SPECNORM* - Compute global spectral norms + +! Purpose. +! -------- +! Interface routine for computing spectral norms + +!** Interface. +! ---------- +! CALL SPECNORM(...) + +! Explicit arguments : All arguments optional +! -------------------- +! PSPEC(:,:) - Spectral array +! KVSET(:) - "B-Set" for each field +! KMASTER - processor to recieve norms +! KRESOL - resolution tag which is required ,default is the +! first defined resulution (input) +! PMET(:) - metric +! PNORM(:) - Norms (output for processor KMASTER) +! +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- SPNORM_CTL - control routine + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR +USE TPM_DISTR ,ONLY : D, NPRTRV, MYSETV, MYPROC + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE SPNORM_CTL_MOD ,ONLY : SPNORM_CTL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + + +REAL(KIND=JPRB) , INTENT(OUT) :: PNORM(:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPEC(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSET(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KMASTER +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PMET(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IMASTER,IFLD,IFLD_G,J + +! ------------------------------------------------------------------ + +! Set current resolution +CALL SET_RESOL(KRESOL) + +! Set defaults +IMASTER = 1 +IFLD = 0 + + +IF(PRESENT(KMASTER)) THEN + IMASTER = KMASTER +ENDIF + +IF(PRESENT(KVSET)) THEN + IFLD_G = UBOUND(KVSET,1) + DO J=1,IFLD_G + IF(KVSET(J) > NPRTRV) THEN + WRITE(NERR,*) 'SPECNORM:KVSET(J) > NPRTRV ',J,KVSET(J),NPRTRV + CALL ABORT_TRANS('SPECNORM:KVSET TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSET(J) == MYSETV) THEN + IFLD = IFLD+1 + ENDIF + ENDDO +ELSE + IF(PRESENT(PSPEC)) THEN + IFLD = UBOUND(PSPEC,1) + ENDIF + IFLD_G = IFLD +ENDIF + +IF(NPRTRV >1) THEN + IF(IFLD > 0 .AND. .NOT. PRESENT(KVSET)) THEN + WRITE(NERR,*)'NPRTRV >1 AND IFLD > 0 AND NOT PRESENT(KVSET)',& + &NPRTRV,IFLD + CALL ABORT_TRANS('SPECNORM: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF +ENDIF +IF(MYPROC == IMASTER) THEN + IF(UBOUND(PNORM,1) < IFLD_G) THEN + CALL ABORT_TRANS('SPECNORM: PNORM TOO SMALL') + ENDIF +ENDIF +IF(IFLD > 0 ) THEN + IF(.NOT. PRESENT(PSPEC)) THEN + CALL ABORT_TRANS('SPECNORM: PSPEC NOT PRESENT') + ENDIF + IF(UBOUND(PSPEC,1) < IFLD) THEN + CALL ABORT_TRANS('SPECNORM: FIRST DIMENSION OF PSPEC TOO SMALL') + ENDIF + IF(UBOUND(PSPEC,2) < D%NSPEC2) THEN + CALL ABORT_TRANS('SPECNORM: FIRST DIMENSION OF PSPEC TOO SMALL') + ENDIF +ENDIF + +CALL SPNORM_CTL(PNORM,PSPEC,IFLD,IFLD_G,KVSET,IMASTER,PMET) + +!endif INTERFACE + +! ------------------------------------------------------------------ + +END SUBROUTINE SPECNORM + diff --git a/src/trans/gpu/external/sugawc.F90 b/src/trans/gpu/external/sugawc.F90 new file mode 100755 index 00000000..8f23f491 --- /dev/null +++ b/src/trans/gpu/external/sugawc.F90 @@ -0,0 +1,102 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE SUGAWC(KDGLG,PMU,PW) + +!**** *SUGAWC* - Compute Gaussian latitudes and weights + +! Purpose. +! -------- +! Compute Gaussian latitudes and weights. + +!** Interface. +! ---------- +! CALL SUGAWC(...) + +! Explicit arguments : +! -------------------- +! INPUT: +! KDGLG - number of latitudes. + +! OUTPUT: +! PMU - sine of Gaussian latitudes. +! PW - Gaussian weights. + +! Method. +! ------- + +! Externals. SUGAW +! ---------- + +! Author. +! ------- +! K. Yessad, from SUGAWA and SULEG (trans) +! Original : May 2012 + +! Modifications. +! -------------- +! F. Vana 05-Mar-2015 Support for single precision + +! ------------------------------------------------------------------ + +USE EC_PARKIND ,ONLY : JPRD, JPIM + +!ifndef INTERFACE + +USE SUGAW_MOD, ONLY : SUGAW + +!endif INTERFACE + +! ------------------------------------------------------------------ + +IMPLICIT NONE + +INTEGER(KIND=JPIM) ,INTENT(IN) :: KDGLG +REAL(KIND=JPRD) ,INTENT(OUT) :: PMU(:) +REAL(KIND=JPRD) ,INTENT(OUT) :: PW(:) + +!ifndef INTERFACE + +REAL(KIND=JPRD) :: ZANM +INTEGER(KIND=JPIM) :: ISTART,IODD,JN,JGL +REAL(KIND=JPRD) :: ZFN(0:KDGLG,0:KDGLG) +REAL(KIND=JPRD) :: ZFNN + +! ------------------------------------------------------------------ + +! * preliminary calculations to compute input quantities ZANM and ZFN +! (k.y.: coded after what I found in tfl/module/suleg_mod.F90). +ISTART=1 +! Belousov, Swarztrauber use ZFN(0,0)=SQRT(2._JPRD) +! IFS normalisation chosen to be 0.5*Integral(Pnm**2) = 1 +ZFN(0,0)=2._JPRD +DO JN=ISTART,KDGLG + ZFNN=ZFN(0,0) + DO JGL=1,JN + ZFNN=ZFNN*SQRT(1._JPRD-0.25_JPRD/REAL(JGL**2,JPRD)) + ENDDO + IODD=MOD(JN,2) + ZFN(JN,JN)=ZFNN + DO JGL=2,JN-IODD,2 + ZFN(JN,JN-JGL)=ZFN(JN,JN-JGL+2)*REAL((JGL-1)*(2*JN-JGL+2),JPRD)/REAL(JGL*(2*JN-JGL+1),JPRD) + ENDDO +ENDDO + +ZANM=SQRT(REAL(2*KDGLG+1,JPRD)*REAL(KDGLG**2,JPRD)/REAL(2*KDGLG-1,JPRD)) + +! * call to SUGAW (output: PW, PMU): +CALL SUGAW(KDGLG,0,KDGLG,PMU,PW,ZANM,ZFN) + +! ------------------------------------------------------------------ + +!endif INTERFACE + +END SUBROUTINE SUGAWC + diff --git a/src/trans/gpu/external/trans_end.F90 b/src/trans/gpu/external/trans_end.F90 new file mode 100755 index 00000000..e3f6f8b9 --- /dev/null +++ b/src/trans/gpu/external/trans_end.F90 @@ -0,0 +1,154 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE TRANS_END(CDMODE) + +!**** *TRANS_END* - Terminate transform package + +! Purpose. +! -------- +! Terminate transform package. Release all allocated arrays. + +!** Interface. +! ---------- +! CALL TRANS_END + +! Explicit arguments : None +! -------------------- + +! Method. +! ------- + +! Externals. None +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 +! G. Radnoti: 19-03-2009: intermediate end of transf to allow to switch to mono-task transforms +! R. El Khatib 09-Jul-2013 LENABLED + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM, JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : MSETUP0, NCUR_RESOL, NMAX_RESOL, LENABLED,NDEF_RESOL +USE TPM_DIM ,ONLY : R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL +USE TPM_DISTR ,ONLY : D, DISTR_RESOL, NPRCIDS,D_NUMP,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1, D_NASM0, & +& D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS +USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX +USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL,F_RW,ZEPSNM,ZAA,ZAS,ZAA0,ZAS0 +USE TPM_FFT ,ONLY : T, FFT_RESOL +USE TPM_CTL ,ONLY : C, CTL_RESOL +USE TPM_FLT +USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN + +USE EQ_REGIONS_MOD ,ONLY : N_REGIONS +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE DEALLOC_RESOL_MOD ,ONLY : DEALLOC_RESOL +! + +IMPLICIT NONE +CHARACTER*5, OPTIONAL, INTENT(IN) :: CDMODE +! Local variables +INTEGER(KIND=JPIM) :: JRES +CHARACTER*5 :: CLMODE +! ------------------------------------------------------------------ +CLMODE='FINAL' +IF (PRESENT(CDMODE)) CLMODE=CDMODE +IF (CLMODE == 'FINAL') THEN + +#ifdef ACCGPU + !$ACC EXIT DATA DELETE(ZAA0,ZAS0,ZEPSNM,ZAA,ZAS) +#endif +#ifdef OMPGPU +#endif + DEALLOCATE(ZAA0) + DEALLOCATE(ZAS0) + DEALLOCATE(ZEPSNM) + DEALLOCATE(ZAA) + DEALLOCATE(ZAS) + + + DEALLOCATE(D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_MYMS,D_NPROCL,D_NASM0,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NLOEN,F_RW) +#ifdef ACCGPU + !$ACC EXIT DATA DELETE(R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_NPROCL, D_NUMP,D_MYMS, & + !$ACC& G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,D_NASM0,F_RW) + +#endif +#ifdef OMPGPU + !$OMP TARGET EXIT DATA MAP(DELETE: ) +#endif + !CALL HIP_DGEMM_BATCHED_FINALIZE() + + IF( ALLOCATED( LENABLED ) ) THEN + DO JRES=1,NMAX_RESOL + IF(LENABLED(JRES)) THEN + CALL DEALLOC_RESOL(JRES) + ENDIF + ENDDO + DEALLOCATE(LENABLED) + ENDIF + + NULLIFY(R) + IF( ALLOCATED(DIM_RESOL) ) DEALLOCATE(DIM_RESOL) + + NULLIFY(D) + IF( ALLOCATED(DISTR_RESOL) ) DEALLOCATE(DISTR_RESOL) + + !TPM_FFT + NULLIFY(T) + IF( ALLOCATED(FFT_RESOL) ) DEALLOCATE(FFT_RESOL) + + !TPM_FLT + NULLIFY(S) + IF( ALLOCATED(FLT_RESOL) ) DEALLOCATE(FLT_RESOL) + + !TPM_CTL + NULLIFY(C) + IF( ALLOCATED(CTL_RESOL) ) DEALLOCATE(CTL_RESOL) + + !TPM_FIELDS + NULLIFY(F) + IF( ALLOCATED(FIELDS_RESOL) ) DEALLOCATE(FIELDS_RESOL) + + + !TPM_GEOMETRY + NULLIFY(G) + IF( ALLOCATED(GEOM_RESOL) ) DEALLOCATE(GEOM_RESOL) + + !TPM_TRANS + IF(ALLOCATED(FOUBUF_IN)) DEALLOCATE(FOUBUF_IN) + IF(ALLOCATED(FOUBUF)) DEALLOCATE(FOUBUF) + + MSETUP0 = 0 + NMAX_RESOL = 0 + NCUR_RESOL = 0 + NDEF_RESOL = 0 +ENDIF +IF (CLMODE == 'FINAL' .OR. CLMODE == 'INTER') THEN + !EQ_REGIONS + IF( ASSOCIATED(N_REGIONS) ) DEALLOCATE(N_REGIONS) + !TPM_DISTR + IF( ALLOCATED(NPRCIDS) ) DEALLOCATE(NPRCIDS) +ENDIF + +! ------------------------------------------------------------------ + +!endif INTERFACE + +END SUBROUTINE TRANS_END diff --git a/src/trans/gpu/external/trans_inq.F90 b/src/trans/gpu/external/trans_inq.F90 new file mode 100755 index 00000000..b0c19042 --- /dev/null +++ b/src/trans/gpu/external/trans_inq.F90 @@ -0,0 +1,529 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE TRANS_INQ(KRESOL,KSPEC,KSPEC2,KSPEC2G,KSPEC2MX,KNUMP,& + &KGPTOT,KGPTOTG,KGPTOTMX,KGPTOTL,& + &KMYMS,KASM0,KUMPP,KPOSSP,KPTRMS,KALLMS,KDIM0G,& + &KFRSTLAT,KLSTLAT,KFRSTLOFF,KPTRLAT,& + &KPTRFRSTLAT,KPTRLSTLAT,KPTRFLOFF,KSTA,KONL,& + &KULTPP,KPTRLS,KNMENG,& + &KPRTRW,KMYSETW,KMYSETV,KMY_REGION_NS,KMY_REGION_EW,& + &LDSPLITLAT,& + &KSMAX,PLAPIN,KNVALUE,KDEF_RESOL,LDLAM,& + &PMU,PGW,PRPNM,KLEI3,KSPOLEGL,KPMS,KDGLU) + +!**** *TRANS_INQ* - Extract information from the transform package + +! Purpose. +! -------- +! Interface routine for extracting information from the T.P. + +!** Interface. +! ---------- +! CALL TRANS_INQ(...) +! Explicit arguments : All arguments are optional. +! -------------------- +! KRESOL - resolution tag for which info is required ,default is the +! first defined resulution (input) + +! MULTI-TRANSFORMS MANAGEMENT +! KDEF_RESOL - number or resolutions defined +! LDLAM - .T. if the corresponding resolution is LAM, .F. if it is global + +! SPECTRAL SPACE +! KSPEC - number of complex spectral coefficients on this PE +! KSPEC2 - 2*KSPEC +! KSPEC2G - global KSPEC2 +! KSPEC2MX - maximun KSPEC2 among all PEs +! KNUMP - Number of spectral waves handled by this PE +! KGPTOT - Total number of grid columns on this PE +! KGPTOTG - Total number of grid columns on the Globe +! KGPTOTMX - Maximum number of grid columns on any of the PEs +! KGPTOTL - Number of grid columns one each PE (dimension N_REGIONS_NS:N_REGIONS_EW) +! KMYMS - This PEs spectral zonal wavenumbers +! KASM0 - Address in a spectral array of (m, n=m) +! KUMPP - No. of wave numbers each wave set is responsible for +! KPOSSP - Defines partitioning of global spectral fields among PEs +! KPTRMS - Pointer to the first wave number of a given a-set +! KALLMS - Wave numbers for all wave-set concatenated together +! to give all wave numbers in wave-set order +! KDIM0G - Defines partitioning of global spectral fields among PEs +! KSMAX - spectral truncation +! KNVALUE - n value for each KSPEC2 spectral coeffient + +! GRIDPOINT SPACE +! KFRSTLAT - First latitude of each a-set in grid-point space +! KLSTTLAT - Last latitude of each a-set in grid-point space +! KFRSTLOFF - Offset for first lat of own a-set in grid-point space +! KPTRLAT - Pointer to the start of each latitude +! KPTRFRSTLAT - Pointer to the first latitude of each a-set in +! NSTA and NONL arrays +! KPTRLSTLAT - Pointer to the last latitude of each a-set in +! NSTA and NONL arrays +! KPTRFLOFF - Offset for pointer to the first latitude of own a-set +! NSTA and NONL arrays, i.e. nptrfrstlat(myseta)-1 +! KSTA - Position of first grid column for the latitudes on a +! processor. The information is available for all processors. +! The b-sets are distinguished by the last dimension of +! nsta().The latitude band for each a-set is addressed by +! nptrfrstlat(jaset),nptrlstlat(jaset), and +! nptrfloff=nptrfrstlat(myseta) on this processors a-set. +! Each split latitude has two entries in nsta(,:) which +! necessitates the rather complex addressing of nsta(,:) +! and the overdimensioning of nsta by N_REGIONS_NS. +! KONL - Number of grid columns for the latitudes on a processor. +! Similar to nsta() in data structure. +! LDSPLITLAT - TRUE if latitude is split in grid point space over +! two a-sets + +! FOURIER SPACE +! KULTPP - number of latitudes for which each a-set is calculating +! the FFT's. +! KPTRLS - pointer to first global latitude of each a-set for which +! it performs the Fourier calculations +! KNMENG - associated (with NLOENG) cut-off zonal wavenumber + +! LEGENDRE +! PMU - sin(Gaussian latitudes) +! PGW - Gaussian weights +! PRPNM - Legendre polynomials +! KLEI3 - First dimension of Legendre polynomials +! KSPOLEGL - Second dimension of Legendre polynomials +! KPMS - Adress for legendre polynomial for given M (NSMAX) +! PLAPIN - Eigen-values of the inverse Laplace operator +! KDGLU - Number of active points in an hemisphere for a given wavenumber "m" + +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 +! M. Hortal : 2001-03-05 Dimensions of the Legendre polynomials +! R. El Khatib 08-Aug-2012 KSMAX,PLAPIN,KNVALUE,LDLAM,KDEF_RESOL + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB, JPRD + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NDEF_RESOL +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D, NPRTRNS, NPRTRW, MYSETV, MYSETW, NPRTRV +USE TPM_GEOMETRY ,ONLY : G +USE TPM_FIELDS ,ONLY : F +USE TPM_FLT + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS, & + & N_REGIONS_EW, N_REGIONS_NS + +!endif INTERFACE + +IMPLICIT NONE + +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL + +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KSPEC +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KSPEC2 +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KSPEC2G +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KSPEC2MX +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KNUMP +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KGPTOT +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KGPTOTG +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KGPTOTMX +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KGPTOTL(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KFRSTLOFF +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPTRFLOFF + +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KMYMS(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KASM0(0:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KUMPP(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPOSSP(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPTRMS(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KALLMS(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KDIM0G(0:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KFRSTLAT(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KLSTLAT(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPTRLAT(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPTRFRSTLAT(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPTRLSTLAT(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KSTA(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KONL(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPRTRW +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KMYSETW +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KMYSETV +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KMY_REGION_NS +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KMY_REGION_EW +LOGICAL ,OPTIONAL, INTENT(OUT) :: LDSPLITLAT(:) + +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KULTPP(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPTRLS(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KNMENG(:) + +REAL(KIND=JPRD) ,OPTIONAL, INTENT(OUT) :: PMU(:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PGW(:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PRPNM(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KLEI3 +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KSPOLEGL +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KPMS(0:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KDGLU(0:) + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PLAPIN(-1:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KSMAX +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KNVALUE(:) + +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(OUT) :: KDEF_RESOL +LOGICAL ,OPTIONAL,INTENT(OUT) :: LDLAM + +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IU1,IU2 +INTEGER(KIND=JPIM) :: IC, JN, JMLOC +INTEGER(KIND=JPIM) :: IPRTRV,JSETV,IMLOC,IM,ISL,IA,ILA,IS,ILS,IDGLU,J,I +! ------------------------------------------------------------------ + + +! Set current resolution +CALL SET_RESOL(KRESOL) + +IF(PRESENT(KSPEC)) KSPEC = D%NSPEC +IF(PRESENT(KSPEC2)) KSPEC2 = D%NSPEC2 +IF(PRESENT(KSPEC2G)) KSPEC2G = R%NSPEC2_G +IF(PRESENT(KSPEC2MX)) KSPEC2MX = D%NSPEC2MX +IF(PRESENT(KNUMP)) KNUMP = D%NUMP +IF(PRESENT(KGPTOT)) KGPTOT = D%NGPTOT +IF(PRESENT(KGPTOTG)) KGPTOTG = D%NGPTOTG +IF(PRESENT(KGPTOTMX)) KGPTOTMX = D%NGPTOTMX +IF(PRESENT(KFRSTLOFF)) KFRSTLOFF = D%NFRSTLOFF +IF(PRESENT(KPTRFLOFF)) KPTRFLOFF = D%NPTRFLOFF +IF(PRESENT(KPTRFLOFF)) KPTRFLOFF = D%NPTRFLOFF +IF(PRESENT(KPRTRW)) KPRTRW = NPRTRW +IF(PRESENT(KMYSETW)) KMYSETW = MYSETW +IF(PRESENT(KMYSETV)) KMYSETV = MYSETV +IF(PRESENT(KMY_REGION_NS)) KMY_REGION_NS = MY_REGION_NS +IF(PRESENT(KMY_REGION_EW)) KMY_REGION_EW = MY_REGION_EW +IF(PRESENT(LDLAM)) LDLAM = G%LAM +IF(PRESENT(KDEF_RESOL)) KDEF_RESOL = NDEF_RESOL + +IF(PRESENT(KGPTOTL)) THEN + IF(UBOUND(KGPTOTL,1) < N_REGIONS_NS) THEN + CALL ABORT_TRANS('TRANS_INQ: KGPTOTL DIM 1 TOO SMALL') + ELSEIF(UBOUND(KGPTOTL,2) < N_REGIONS_EW) THEN + CALL ABORT_TRANS('TRANS_INQ: KGPTOTL DIM 2 TOO SMALL') + ELSE + KGPTOTL(1:N_REGIONS_NS,1:N_REGIONS_EW) = D%NGPTOTL(:,:) + ENDIF +ENDIF + +IF(PRESENT(KMYMS)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KMYMS REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KMYMS,1) < D%NUMP) THEN + CALL ABORT_TRANS('TRANS_INQ: KMYMS TOO SMALL') + ELSE + KMYMS(1:D%NUMP) = D%MYMS(:) + ENDIF +ENDIF + +IF(PRESENT(KASM0)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KASM0 REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KASM0,1) < R%NSMAX) THEN + CALL ABORT_TRANS('TRANS_INQ: KASM0 TOO SMALL') + ELSE + KASM0(0:R%NSMAX) = D%NASM0(:) + ENDIF +ENDIF + +IF(PRESENT(KUMPP)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KUMPP REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KUMPP,1) < NPRTRW) THEN + CALL ABORT_TRANS('TRANS_INQ: KUMPP TOO SMALL') + ELSE + KUMPP(1:NPRTRW) = D%NUMPP(:) + ENDIF +ENDIF + +IF(PRESENT(KPOSSP)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KPOSSP REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KPOSSP,1) < NPRTRW+1) THEN + CALL ABORT_TRANS('TRANS_INQ: KPOSSP TOO SMALL') + ELSE + KPOSSP(1:NPRTRW+1) = D%NPOSSP(:) + ENDIF +ENDIF + +IF(PRESENT(KPTRMS)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KPTRMS REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KPTRMS,1) < NPRTRW) THEN + CALL ABORT_TRANS('TRANS_INQ: KPTRMS TOO SMALL') + ELSE + KPTRMS(1:NPRTRW) = D%NPTRMS(:) + ENDIF +ENDIF + +IF(PRESENT(KALLMS)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KALLMS REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KALLMS,1) < R%NSMAX+1) THEN + CALL ABORT_TRANS('TRANS_INQ: KALLMS TOO SMALL') + ELSE + KALLMS(1:R%NSMAX+1) = D%NALLMS(:) + ENDIF +ENDIF + +IF(PRESENT(KDIM0G)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KDIM0G REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KDIM0G,1) < R%NSMAX) THEN + CALL ABORT_TRANS('TRANS_INQ: KDIM0G TOO SMALL') + ELSE + KDIM0G(0:R%NSMAX) = D%NDIM0G(0:R%NSMAX) + ENDIF +ENDIF + +IF(PRESENT(KFRSTLAT)) THEN + IF(UBOUND(KFRSTLAT,1) < N_REGIONS_NS) THEN + CALL ABORT_TRANS('TRANS_INQ: KFRSTLAT TOO SMALL') + ELSE + KFRSTLAT(1:N_REGIONS_NS) = D%NFRSTLAT(:) + ENDIF +ENDIF + +IF(PRESENT(KLSTLAT)) THEN + IF(UBOUND(KLSTLAT,1) < N_REGIONS_NS) THEN + CALL ABORT_TRANS('TRANS_INQ: KLSTLAT TOO SMALL') + ELSE + KLSTLAT(1:N_REGIONS_NS) = D%NLSTLAT(:) + ENDIF +ENDIF + +IF(PRESENT(KPTRLAT)) THEN + IF(UBOUND(KPTRLAT,1) < R%NDGL) THEN + CALL ABORT_TRANS('TRANS_INQ: KPTRLAT TOO SMALL') + ELSE + KPTRLAT(1:R%NDGL) = D%NPTRLAT(:) + ENDIF +ENDIF + +IF(PRESENT(KPTRFRSTLAT)) THEN + IF(UBOUND(KPTRFRSTLAT,1) < N_REGIONS_NS) THEN + CALL ABORT_TRANS('TRANS_INQ: KPTRFRSTLAT TOO SMALL') + ELSE + KPTRFRSTLAT(1:N_REGIONS_NS) = D%NPTRFRSTLAT(:) + ENDIF +ENDIF + +IF(PRESENT(KPTRLSTLAT)) THEN + IF(UBOUND(KPTRLSTLAT,1) < N_REGIONS_NS) THEN + CALL ABORT_TRANS('TRANS_INQ: KPTRLSTLAT TOO SMALL') + ELSE + KPTRLSTLAT(1:N_REGIONS_NS) = D%NPTRLSTLAT(:) + ENDIF +ENDIF + +IF(PRESENT(KSTA)) THEN + IF(UBOUND(KSTA,1) < R%NDGL+N_REGIONS_NS-1) THEN + CALL ABORT_TRANS('TRANS_INQ: KSTA DIM 1 TOO SMALL') + ELSEIF(UBOUND(KSTA,2) < N_REGIONS_EW) THEN + CALL ABORT_TRANS('TRANS_INQ: KSTA DIM 2 TOO SMALL') + ELSE + KSTA(1:R%NDGL+N_REGIONS_NS-1,1:N_REGIONS_EW) = D%NSTA(:,:) + ENDIF +ENDIF + +IF(PRESENT(KONL)) THEN + IF(UBOUND(KONL,1) < R%NDGL+N_REGIONS_NS-1) THEN + CALL ABORT_TRANS('TRANS_INQ: KONL DIM 1 TOO SMALL') + ELSEIF(UBOUND(KONL,2) < N_REGIONS_EW) THEN + CALL ABORT_TRANS('TRANS_INQ: KONL DIM 2 TOO SMALL') + ELSE + KONL(1:R%NDGL+N_REGIONS_NS-1,1:N_REGIONS_EW) = D%NONL(:,:) + ENDIF +ENDIF + +IF(PRESENT(LDSPLITLAT)) THEN + IF(UBOUND(LDSPLITLAT,1) < R%NDGL) THEN + CALL ABORT_TRANS('TRANS_INQ: LDSPLITLAT TOO SMALL') + ELSE + LDSPLITLAT(1:R%NDGL) = D%LSPLITLAT(:) + ENDIF +ENDIF + +IF(PRESENT(KULTPP)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KULTPP REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KULTPP,1) < NPRTRNS) THEN + CALL ABORT_TRANS('TRANS_INQ: KULTPP TOO SMALL') + ELSE + KULTPP(1:NPRTRNS) = D%NULTPP(:) + ENDIF +ENDIF + +IF(PRESENT(KPTRLS)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KPTRLS REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KPTRLS,1) < NPRTRNS) THEN + CALL ABORT_TRANS('TRANS_INQ: KPTRLS TOO SMALL') + ELSE + KPTRLS(1:NPRTRNS) = D%NPTRLS(:) + ENDIF +ENDIF + +IF(PRESENT(KNMENG)) THEN + IF(UBOUND(KNMENG,1) < R%NDGL) THEN + CALL ABORT_TRANS('TRANS_INQ: KNMENG TOO SMALL') + ELSE + KNMENG(1:R%NDGL) = G%NMEN(1:R%NDGL) + ENDIF +ENDIF + +IF(PRESENT(PMU)) THEN + IF(UBOUND(PMU,1) < R%NDGL) THEN + CALL ABORT_TRANS('TRANS_INQ: PMU TOO SMALL') + ELSE + PMU(1:R%NDGL) = F%RMU + ENDIF +ENDIF + +IF(PRESENT(PGW)) THEN + IF(UBOUND(PGW,1) < R%NDGL) THEN + CALL ABORT_TRANS('TRANS_INQ: PGW TOO SMALL') + ELSE + PGW(1:R%NDGL) = REAl(F%RW,JPRB) + ENDIF +ENDIF + +IF(PRESENT(PRPNM)) THEN + + IF( .NOT. S%LKEEPRPNM ) THEN + CALL ABORT_TRANS('TRANS_INQ: PRPNM REQUIRED BUT S%LKEEPRPNM=F') + ENDIF + + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: PRPNM REQUIRED BUT LGRIDONLY=T') + ENDIF + IU1 = UBOUND(PRPNM,1) + IU2 = UBOUND(PRPNM,2) + IF(IU1 < R%NDGNH) THEN + CALL ABORT_TRANS('TRANS_INQ:FIRST DIM. OF PRNM TOO SMALL') + ELSE +! IU1 = MIN(IU1,R%NLEI3) +! IU2 = MIN(IU2,D%NSPOLEGL) +! PRPNM(1:IU1,1:IU2) = F%RPNM(1:IU1,1:IU2) + + DO JMLOC=1,D%NUMP,NPRTRV + IPRTRV=MIN(NPRTRV,D%NUMP-JMLOC+1) + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ISL = MAX(R%NDGNH-G%NDGLU(IM)+1,1) + IA = 1+MOD(R%NSMAX-IM+2,2) + ILA = (R%NSMAX-IM+2)/2 + IS = 1+MOD(R%NSMAX-IM+1,2) + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + DO J=1,ILA + DO I=1,IDGLU + PRPNM(ISL+I-1,D%NPMS(IM)+IA+(J-1)*2) = REAL(S%FA(IMLOC)%RPNMA(I,J),JPRB) + ENDDO + ENDDO + DO J=1,ILS + DO I=1,IDGLU + PRPNM(ISL+I-1,D%NPMS(IM)+IS+(J-1)*2) = REAL(S%FA(IMLOC)%RPNMS(I,J),JPRB) + ENDDO + ENDDO + ENDDO + ENDDO + ENDIF +ENDIF +IF(PRESENT(KLEI3)) THEN + KLEI3=R%NLEI3 +ENDIF +IF(PRESENT(KSPOLEGL)) THEN + KSPOLEGL=D%NSPOLEGL +ENDIF +IF(PRESENT(KPMS)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KPMS REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(KPMS,1) < R%NSMAX) THEN + CALL ABORT_TRANS('TRANS_INQ: KPMS TOO SMALL') + ELSE + KPMS(0:R%NSMAX) = D%NPMS(0:R%NSMAX) + ENDIF +ENDIF + +IF(PRESENT(KSMAX)) KSMAX = R%NSMAX +IF(PRESENT(PLAPIN)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: PLAPIN REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(UBOUND(PLAPIN,1) < R%NSMAX+2) THEN + CALL ABORT_TRANS('TRANS_INQ: PLAPIN TOO SMALL') + ELSEIF (LBOUND(PLAPIN,1) /= -1) THEN + CALL ABORT_TRANS('TRANS_INQ: LOWER BOUND OF PLAPIN SHOULD BE -1') + ELSE + PLAPIN(-1:R%NSMAX+2) = REAL(F%RLAPIN(:),JPRB) + ENDIF +ENDIF +IF(PRESENT(KNVALUE)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_INQ: KNVALUE REQUIRED BUT LGRIDONLY=T') + ENDIF + IF(SIZE(KNVALUE) < D%NSPEC2) THEN + CALL ABORT_TRANS('TRANS_INQ: KNVALUE TOO SMALL') + ELSE + IC=1 + DO JMLOC=1,D%NUMP + DO JN=D%MYMS(JMLOC),R%NSMAX + KNVALUE(IC )=JN + KNVALUE(IC+1)=JN + IC=IC+2 + ENDDO + ENDDO + ENDIF +ENDIF + + +IF(PRESENT(KDGLU)) THEN + IF(UBOUND(KDGLU,1) < R%NSMAX) THEN + CALL ABORT_TRANS('TRANS_INQ: KDGLU TOO SMALL') + ELSE + KDGLU(0:R%NSMAX) = G%NDGLU(0:R%NSMAX) + ENDIF +ENDIF +! ------------------------------------------------------------------ + +!endif INTERFACE + +END SUBROUTINE TRANS_INQ diff --git a/src/trans/gpu/external/trans_pnm.F90 b/src/trans/gpu/external/trans_pnm.F90 new file mode 100755 index 00000000..c8ccd0c0 --- /dev/null +++ b/src/trans/gpu/external/trans_pnm.F90 @@ -0,0 +1,200 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE TRANS_PNM(KRESOL,KM,PRPNM,LDTRANSPOSE,LDCHEAP) + +!**** *TRANS_PNM* - Compute Legendre polynomials for a given wavenember + +! Purpose. +! -------- +! Interface routine for computing Legendre polynomials for a given wavenember + +!** Interface. +! ---------- +! CALL TRANS_PNM(...) + +! Explicit arguments : All arguments are optional. +! -------------------- +! KRESOL - resolution tag for which info is required ,default is the +! first defined resulution (input) +! KM - wave number +! PRPNM - Legendre polynomials +! LDTRANSPOSE - Legendre polynomials array is transposed +! LDCHEAP - cheapest but less accurate computation + +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- + +! Author. +! ------- +! R. El Khatib *METEO-FRANCE* + +! Modifications. +! -------------- +! Original : 22-Jan-2016 from G. Mozdzynski's getpnm + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPRD, JPIM +USE PARKIND_ECTRANS,ONLY : JPRBT + +!ifndef INTERFACE + +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D +USE TPM_GEOMETRY ,ONLY : G +USE TPM_FIELDS ,ONLY : F +USE TPM_FLT ,ONLY : S + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE TPM_POL +USE SUPOLF_MOD + +!endif INTERFACE + +IMPLICIT NONE + +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +INTEGER(KIND=JPIM) ,INTENT(IN) :: KM +REAL(KIND=JPRBT) ,OPTIONAL, INTENT(OUT) :: PRPNM(:,:) +LOGICAL, OPTIONAL, INTENT(IN) :: LDTRANSPOSE +LOGICAL, OPTIONAL, INTENT(IN) :: LDCHEAP + +!ifndef INTERFACE + +INTEGER(KIND=JPIM) :: IU1, IU2, IMAXN, INMAX, ICHEAP_SYM, ICHEAP_ANTISYM +INTEGER(KIND=JPIM) :: IC, JN, JMLOC, JGL, JI +INTEGER(KIND=JPIM) :: IA, IS, IDGLU, ILA, ILS, ISL +REAL(KIND=JPRD), ALLOCATABLE :: ZLPOL(:) +LOGICAL :: LLTRANSPOSE, LLCHEAP +! ------------------------------------------------------------------ + +! Set current resolution +IF (PRESENT(KRESOL)) THEN + CALL SET_RESOL(KRESOL) +ENDIF + +IF (PRESENT(LDTRANSPOSE)) THEN + LLTRANSPOSE=LDTRANSPOSE +ELSE + LLTRANSPOSE=.FALSE. +ENDIF + +IF (PRESENT(LDCHEAP)) THEN + LLCHEAP=LDCHEAP +ELSE + LLCHEAP=.FALSE. +ENDIF +IF (LLCHEAP) THEN + ICHEAP_SYM =2 + ICHEAP_ANTISYM=3 +ELSE + ICHEAP_SYM =1 + ICHEAP_ANTISYM=1 +ENDIF + +IF (PRESENT(PRPNM)) THEN + IF(D%LGRIDONLY) THEN + CALL ABORT_TRANS('TRANS_PNM: PRPNM REQUIRED BUT LGRIDONLY=T') + ENDIF +ENDIF + +IU1 = UBOUND(PRPNM,1) +IU2 = UBOUND(PRPNM,2) + +IF (LLTRANSPOSE) THEN + + IF(IU2 < R%NLEI3) THEN + CALL ABORT_TRANS('TRANS_PNM : FIRST DIM. OF PRPNM TOO SMALL') + ENDIF + IF(IU1 < R%NTMAX-KM+3) THEN + CALL ABORT_TRANS('TRANS_PNM : SECOND DIM. OF PRPNM TOO SMALL') + ENDIF + + IF (IU2 >= R%NLEI3) THEN + PRPNM(:,R%NLEI3) = 0.0_JPRBT + ENDIF + +ELSE + + IF(IU1 < R%NLEI3) THEN + CALL ABORT_TRANS('TRANS_PNM : FIRST DIM. OF PRPNM TOO SMALL') + ENDIF + IF(IU2 < R%NTMAX-KM+3) THEN + CALL ABORT_TRANS('TRANS_PNM : SECOND DIM. OF PRPNM TOO SMALL') + ENDIF + + IF (IU1 >= R%NLEI3) THEN + PRPNM(R%NLEI3,:) = 0.0_JPRBT + ENDIF + +ENDIF + +ILA = (R%NTMAX-KM+2)/2 +ILS = (R%NTMAX-KM+3)/2 + +CALL INI_POL(R%NTMAX+2,LDFAST=.TRUE.) + +IMAXN=R%NTMAX+1 + +IA = 1+MOD(R%NTMAX-KM+2,2) +IS = 1+MOD(R%NTMAX-KM+1,2) + +ISL = MAX(R%NDGNH-G%NDGLU(KM)+1,1) +IF (S%LSOUTHPNM) THEN + IDGLU = 2*MIN(R%NDGNH,G%NDGLU(KM)) +ELSE + IDGLU = MIN(R%NDGNH,G%NDGLU(KM)) +ENDIF + +IF(MOD(IMAXN-KM,2) == 0) THEN + INMAX=IMAXN+1 +ELSE + INMAX=IMAXN +ENDIF + +ALLOCATE(ZLPOL(0:R%NTMAX+2)) + +!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,ZLPOL,JI,JN) +DO JGL=1,IDGLU + CALL SUPOLF(KM,INMAX,REAL (F%RMU(ISL+JGL-1), JPRD),ZLPOL(0:INMAX),KCHEAP=ICHEAP_ANTISYM) + IF (LLTRANSPOSE) THEN + DO JI=1,ILA + PRPNM(IA+(JI-1)*2,ISL+JGL-1) = ZLPOL(KM+2*(ILA-JI)+1) + ENDDO + ELSE + DO JI=1,ILA + PRPNM(ISL+JGL-1,IA+(JI-1)*2) = ZLPOL(KM+2*(ILA-JI)+1) + ENDDO + ENDIF + CALL SUPOLF(KM,INMAX,REAL (F%RMU(ISL+JGL-1), JPRD),ZLPOL(0:INMAX),KCHEAP=ICHEAP_SYM) + IF (LLTRANSPOSE) THEN + DO JI=1,ILS + PRPNM(IS+(JI-1)*2,ISL+JGL-1) = ZLPOL(KM+2*(ILS-JI)) + ENDDO + ELSE + DO JI=1,ILS + PRPNM(ISL+JGL-1,IS+(JI-1)*2) = ZLPOL(KM+2*(ILS-JI)) + ENDDO + ENDIF +ENDDO +!$OMP END PARALLEL DO + +CALL END_POL + +! ------------------------------------------------------------------ + +!endif INTERFACE + +END SUBROUTINE TRANS_PNM diff --git a/src/trans/gpu/external/trans_release.F90 b/src/trans/gpu/external/trans_release.F90 new file mode 100755 index 00000000..ea97b3cf --- /dev/null +++ b/src/trans/gpu/external/trans_release.F90 @@ -0,0 +1,61 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE TRANS_RELEASE(KRESOL) + +!**** *TRANS_RELEASE* - release a spectral resolution + +! Purpose. +! -------- +! Release all arrays related to a given resolution tag + +!** Interface. +! ---------- +! CALL TRANS_RELEASE + +! Explicit arguments : KRESOL : resolution tag +! -------------------- + +! Method. +! ------- + +! Externals. None +! ---------- + +! Author. +! ------- +! R. El Khatib *METEO-FRANCE* + +! Modifications. +! -------------- +! Original : 09-Jul-2013 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM + +!ifndef INTERFACE + +USE DEALLOC_RESOL_MOD ,ONLY : DEALLOC_RESOL +! + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KRESOL + +!endif INTERFACE + +! ------------------------------------------------------------------ + +CALL DEALLOC_RESOL(KRESOL) + +! ------------------------------------------------------------------ + +END SUBROUTINE TRANS_RELEASE diff --git a/src/trans/gpu/external/vordiv_to_uv.F90 b/src/trans/gpu/external/vordiv_to_uv.F90 new file mode 100755 index 00000000..7aa8342c --- /dev/null +++ b/src/trans/gpu/external/vordiv_to_uv.F90 @@ -0,0 +1,179 @@ +! (C) Copyright 2015- ECMWF. +! (C) Copyright 2015- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +SUBROUTINE VORDIV_TO_UV(PSPVOR,PSPDIV,PSPU,PSPV,KSMAX,KVSETUV) + +!**** *VORDIV_TO_UV* - Convert spectral vorticity and divergence to spectral U (u*cos(theta)) and V (v*cos(theta). + +! Purpose. +! -------- +! Interface routine for Convert spectral vorticity and divergence to spectral U and V + +!** Interface. +! ---------- +! CALL VORDIV_TO_UV(...) + +! Explicit arguments : +! -------------------- +! PSPVOR(:,:) - spectral vorticity (input) +! PSPDIV(:,:) - spectral divergence (input) +! PSPU(:,:) - spectral U (u*cos(theta) (output) +! PSPV(:,:) - spectral V (v*cos(theta) (output) +! KSMAX - spectral resolution (input) +! KVSETUV(:) - Optionally indicating which 'b-set' in spectral space owns a +! vor/div field. Equivalant to NBSETLEV in the IFS. +! The length of KVSETUV should be the GLOBAL number +! of u/v fields which is the dimension of u and v releated +! fields in grid-point space. + +! Method. +! ------- + +! Externals. SET_RESOL - set resolution +! ---------- VD2UV_CTL - control vordiv to uv + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 15-06-15 + + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!ifndef INTERFACE + +USE TPM_GEN ,ONLY : NERR, NOUT,MSETUP0 +USE TPM_DISTR ,ONLY : D, NPRTRV, MYSETV + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +USE VD2UV_CTL_MOD ,ONLY : VD2UV_CTL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +!endif INTERFACE + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB), INTENT(IN) :: PSPVOR(:,:) +REAL(KIND=JPRB), INTENT(IN) :: PSPDIV(:,:) +REAL(KIND=JPRB), INTENT(OUT) :: PSPU(:,:) +REAL(KIND=JPRB), INTENT(OUT) :: PSPV(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KSMAX +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) + +!ifndef INTERFACE + +! Local varaibles +INTEGER(KIND=JPIM) :: IUBOUND(4),J +INTEGER(KIND=JPIM) :: IF_UV,IF_UV_G,IRESOL,IDGL +LOGICAL :: LTMP_SETUP0 +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +#include "setup_trans0.h" +#include "setup_trans.h" +#include "trans_release.h" +#include "trans_end.h" + +! ------------------------------------------------------------------ + +IF (LHOOK) CALL DR_HOOK('VORDIV_TO_UV',0,ZHOOK_HANDLE) + +!CALL GSTATS(XXXX,0) + +IF(MSETUP0 == 0) THEN + CALL SETUP_TRANS0() + LTMP_SETUP0 = .TRUE. +ELSE + LTMP_SETUP0 = .FALSE. +ENDIF +IDGL = 2 ! It doesn't matter as long as it's a positive even number +CALL SETUP_TRANS(KSMAX,IDGL,LDSPSETUPONLY=.TRUE.,KRESOL=IRESOL) +CALL SET_RESOL(IRESOL) + + +! Set defaults + +IF_UV = 0 +IF_UV_G = 0 +! Decide requirements + +IF(PRESENT(KVSETUV)) THEN + IF_UV_G = UBOUND(KVSETUV,1) + DO J=1,IF_UV_G + IF(KVSETUV(J) > NPRTRV .OR. KVSETUV(J) < 1) THEN + WRITE(NERR,*) 'VORDIV_TO_UV:KVSETUV(J) > NPRTRV ',J,KVSETUV(J),NPRTRV + CALL ABORT_TRANS('VORDIV_TO_UV:KVSETUV TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') + ENDIF + IF(KVSETUV(J) == MYSETV) THEN + IF_UV = IF_UV+1 + ENDIF + ENDDO +ELSE + IF_UV = UBOUND(PSPVOR,1) + IF_UV_G = IF_UV +ENDIF + +! Consistency checks + +IF (IF_UV > 0) THEN + IF(UBOUND(PSPVOR,1) < IF_UV) THEN + WRITE(NERR,*)'VORDIV_TO_UV : UBOUND(PSPVOR,1) < IF_UV ',UBOUND(PSPVOR,1),IF_UV + CALL ABORT_TRANS('VORDIV_TO_UV : PSPVOR TOO SHORT') + ENDIF + IF(UBOUND(PSPDIV,1) < IF_UV) THEN + WRITE(NERR,*)'VORDIV_TO_UV : UBOUND(PSPDIV,1) < IF_UV ',UBOUND(PSPDIV,1),IF_UV + CALL ABORT_TRANS('VORDIV_TO_UV : PSPDIV TOO SHORT') + ENDIF + IF(UBOUND(PSPU,1) < IF_UV) THEN + WRITE(NERR,*)'VORDIV_TO_UV : UBOUND(PSPU,1) < IF_UV ',UBOUND(PSPU,1),IF_UV + CALL ABORT_TRANS('VORDIV_TO_UV : PSPU TOO SHORT') + ENDIF + IF(UBOUND(PSPV,1) < IF_UV) THEN + WRITE(NERR,*)'VORDIV_TO_UV : UBOUND(PSPV,1) < IF_UV ',UBOUND(PSPV,1),IF_UV + CALL ABORT_TRANS('VORDIV_TO_UV : PSPV TOO SHORT') + ENDIF +ENDIF + + +IF(NPRTRV >1) THEN + IF(IF_UV > 0 .AND. .NOT. PRESENT(KVSETUV)) THEN + WRITE(NERR,*)'NPRTRV >1 AND IF_UV > 0 AND NOT PRESENT(KVSETUV)',& + &NPRTRV,IF_UV + CALL ABORT_TRANS('VORDIV_TO_UV: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') + ENDIF +ENDIF + +!CALL GSTATS(XXXX,1) + +! ------------------------------------------------------------------ + +! Perform transform + +CALL VD2UV_CTL(IF_UV,PSPVOR,PSPDIV,PSPU,PSPV) + +CALL TRANS_RELEASE(IRESOL) +IF (LTMP_SETUP0) THEN + CALL TRANS_END() +ENDIF + +IF (LHOOK) CALL DR_HOOK('VORDIV_TO_UV',1,ZHOOK_HANDLE) +! ------------------------------------------------------------------ + +!endif INTERFACE + +END SUBROUTINE VORDIV_TO_UV + diff --git a/src/trans/gpu/internal/abort_trans_mod.F90 b/src/trans/gpu/internal/abort_trans_mod.F90 new file mode 100755 index 00000000..b92131d1 --- /dev/null +++ b/src/trans/gpu/internal/abort_trans_mod.F90 @@ -0,0 +1,39 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE ABORT_TRANS_MOD +CONTAINS +SUBROUTINE ABORT_TRANS(CDTEXT) + +USE TPM_GEN ,ONLY : NOUT,NERR +USE TPM_DISTR ,ONLY : NPROC,MYPROC +USE MPL_MODULE ,ONLY : MPL_ABORT +USE SDL_MOD ,ONLY : SDL_TRACEBACK, SDL_SRLABORT + +IMPLICIT NONE + + +CHARACTER(LEN=*),INTENT(IN) :: CDTEXT + +WRITE(NOUT,'(1X,A)') 'ABORT_TRANS CALLED' + +WRITE(NOUT,'(1X,A)') CDTEXT +WRITE(NERR,'(1X,A,1X,I3,1X,A)') 'ABORT! ',MYPROC,CDTEXT +CLOSE(NOUT) +IF (NPROC > 1) THEN + CALL MPL_ABORT(CDTEXT) +ELSE + CALL SDL_TRACEBACK + CALL FLUSH(0) + CALL SDL_SRLABORT +ENDIF + +END SUBROUTINE ABORT_TRANS +END MODULE ABORT_TRANS_MOD diff --git a/src/trans/gpu/internal/buffered_allocator_mod.F90 b/src/trans/gpu/internal/buffered_allocator_mod.F90 new file mode 100644 index 00000000..86149ea5 --- /dev/null +++ b/src/trans/gpu/internal/buffered_allocator_mod.F90 @@ -0,0 +1,186 @@ +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +MODULE BUFFERED_ALLOCATOR_MOD + + USE PARKIND_ECTRANS ,ONLY : JPIM + USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + USE ISO_C_BINDING ,ONLY : C_INT8_T, C_SIZE_T, C_LOC, C_F_POINTER + USE GROWING_ALLOCATOR_MOD + + IMPLICIT NONE + + PRIVATE + PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, ASSIGN_PTR, GET_ALLOCATION + PUBLIC :: MAKE_BUFFERED_ALLOCATOR, INSTANTIATE_ALLOCATOR + + ! The buffered allocator uses double buffering. The idea is that the allocator + ! iterates through its two buffers, and each allocate returns one or the other + ! buffer. It is a two-step allocator - it expects you to create reservation + ! handles first for all allocations. Then the allocator is instantiated (i.e. + ! the buffers are actually allocated). Instantiation will do an allocation + ! that is large enough two hold all consecutive allocations. Other allocations + ! might be overwritten (like you can't access the allocation done two steps + ! before). + ! After instantiation, you can retrieve your buffers by passing the allocator + ! and the handles to GET_ALLOCATION. Also, we provide helper function + ! ASSIGN_PTR, because an allocation is often split among several "sub-buffers", + ! so you can for example assign the first half of an allocation to one + ! buffer, while the second half to another buffer. + ! If you see "Logical errors" that usually means you try to retrieve a buffer + ! that is not within the reserved allocation size. This might be a valid + ! region in the sense that it is physically allocated, but it might be part of + ! the double buffer. + + + INTEGER(KIND=JPIM), PARAMETER :: NBUF = 2 + TYPE BUFFERED_ALLOCATOR + INTEGER(KIND=C_SIZE_T) :: BUFR_SZ(0:NBUF-1) + INTEGER(KIND=JPIM) :: NEXT_BUF + TYPE(GROWING_ALLOCATION_TYPE), POINTER :: PTR + END TYPE + TYPE ALLOCATION_RESERVATION_HANDLE + INTEGER(KIND=C_SIZE_T) :: SZ + INTEGER(KIND=JPIM) :: BUF + END TYPE + + INTERFACE ASSIGN_PTR + MODULE PROCEDURE ASSIGN_PTR_FLOAT, ASSIGN_PTR_DOUBLE + END INTERFACE + +CONTAINS + + ! TODO This is not perfect yet. We will over-allocate up to 2X in theory. + ! It would be better to always keep the previous allocation size and then + ! have one allocation sitting at the the top, and the double-buffer at + ! the bottom of the allocation. + + FUNCTION MAKE_BUFFERED_ALLOCATOR() + IMPLICIT NONE + TYPE(BUFFERED_ALLOCATOR) :: MAKE_BUFFERED_ALLOCATOR + + MAKE_BUFFERED_ALLOCATOR%BUFR_SZ(:) = 0 + MAKE_BUFFERED_ALLOCATOR%NEXT_BUF = 0 + END FUNCTION MAKE_BUFFERED_ALLOCATOR + + FUNCTION RESERVE(ALLOCATOR, SZ) + IMPLICIT NONE + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ + + TYPE(ALLOCATION_RESERVATION_HANDLE) :: RESERVE + + ALLOCATOR%BUFR_SZ(ALLOCATOR%NEXT_BUF) = MAX(ALLOCATOR%BUFR_SZ(ALLOCATOR%NEXT_BUF),SZ) + RESERVE%BUF = ALLOCATOR%NEXT_BUF + RESERVE%SZ = SZ + + ALLOCATOR%NEXT_BUF = MOD(ALLOCATOR%NEXT_BUF+1,NBUF) + END FUNCTION RESERVE + + SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) + IMPLICIT NONE + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + !!TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: GROWING_ALLOCATION + TYPE(GROWING_ALLOCATION_TYPE), target, INTENT(INout) :: GROWING_ALLOCATION + INTEGER :: I + + DO I = 0, NBUF-1 + ALLOCATOR%BUFR_SZ(I) = ALIGN(ALLOCATOR%BUFR_SZ(I),128) + ENDDO + ALLOCATOR%PTR => GROWING_ALLOCATION + + CALL REALLOCATE_GROWING_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%BUFR_SZ)) + END SUBROUTINE + + FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) + IMPLICIT NONE + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(ALLOCATION_RESERVATION_HANDLE), INTENT(IN) :: RESERVATION + + INTEGER(KIND=C_INT8_T), POINTER :: GET_ALLOCATION(:) + + IF (RESERVATION%SZ > ALLOCATOR%BUFR_SZ(RESERVATION%BUF)) THEN + CALL ABORT_TRANS( "Logical Error in GET_ALLOCATION") + ENDIF + IF (RESERVATION%BUF == 0) THEN + GET_ALLOCATION(1:) => ALLOCATOR%PTR%PTR(1:RESERVATION%SZ) + ELSE + GET_ALLOCATION(1:) => ALLOCATOR%PTR%PTR(SUM(ALLOCATOR%BUFR_SZ(0:RESERVATION%BUF-1))+1: & + SUM(ALLOCATOR%BUFR_SZ(0:RESERVATION%BUF-1))+RESERVATION%SZ) + ENDIF + END FUNCTION GET_ALLOCATION + + SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM) + USE ISO_C_BINDING + USE OPENACC, ONLY: ACC_ASYNC_SYNC + IMPLICIT NONE + INTEGER(KIND=C_INT8_T), POINTER, INTENT(INOUT) :: SRC(:) + REAL(KIND=C_FLOAT), POINTER, INTENT(OUT) :: DST(:) + LOGICAL, INTENT(IN), OPTIONAL :: SET_VALUE + INTEGER(KIND=4), INTENT(IN), OPTIONAL :: SET_STREAM + LOGICAL :: SET_VALUE_EFF + INTEGER(KIND=4) :: SET_STREAM_EFF + INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + IF (START_IN_BYTES + LENGTH_IN_BYTES - 1 > SIZE(SRC, KIND=C_SIZE_T)) THEN + CALL ABORT_TRANS("Logical Error in ASSIGN_PTR - OOB assignment") + ENDIF + IF (PRESENT(SET_VALUE)) THEN + SET_VALUE_EFF = SET_VALUE + ELSE + SET_VALUE_EFF = .FALSE. + ENDIF + IF (PRESENT(SET_STREAM)) THEN + SET_STREAM_EFF = SET_STREAM + ELSE + SET_STREAM_EFF = ACC_ASYNC_SYNC + ENDIF + IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN + ! This option is turned off by default, but for experimentation we can turn it on. This is + ! setting all bits to 1 (meaning NaN in floating point) + !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) + SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1) = -1 + !$ACC END KERNELS!! LOOP + ENDIF + CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & + & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/SIZEOF(DST(0))]) + END SUBROUTINE ASSIGN_PTR_FLOAT + SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM) + USE ISO_C_BINDING + USE OPENACC, ONLY: ACC_ASYNC_SYNC + IMPLICIT NONE + INTEGER(KIND=C_INT8_T), POINTER, INTENT(INOUT) :: SRC(:) + REAL(KIND=C_DOUBLE), POINTER, INTENT(OUT) :: DST(:) + LOGICAL, INTENT(IN), OPTIONAL :: SET_VALUE + INTEGER(KIND=4), INTENT(IN), OPTIONAL :: SET_STREAM + LOGICAL :: SET_VALUE_EFF + INTEGER(KIND=4) :: SET_STREAM_EFF + INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + IF (START_IN_BYTES + LENGTH_IN_BYTES - 1 > SIZE(SRC, KIND=C_SIZE_T)) THEN + CALL ABORT_TRANS("Logical Error in ASSIGN_PTR - OOB assignment") + ENDIF + IF (PRESENT(SET_VALUE)) THEN + SET_VALUE_EFF = SET_VALUE + ELSE + SET_VALUE_EFF = .FALSE. + ENDIF + IF (PRESENT(SET_STREAM)) THEN + SET_STREAM_EFF = SET_STREAM + ELSE + SET_STREAM_EFF = ACC_ASYNC_SYNC + ENDIF + IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN + ! This option is turned off by default, but for experimentation we can turn it on. This is + ! setting all bits to 1 (meaning NaN in floating point) + !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) + SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1) = -1 + !$ACC END KERNELS!! LOOP + ENDIF + CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & + & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/SIZEOF(DST(0))]) + END SUBROUTINE ASSIGN_PTR_DOUBLE +END MODULE diff --git a/src/trans/gpu/internal/cdmap_mod.F90 b/src/trans/gpu/internal/cdmap_mod.F90 new file mode 100755 index 00000000..10648a06 --- /dev/null +++ b/src/trans/gpu/internal/cdmap_mod.F90 @@ -0,0 +1,178 @@ +! (C) Copyright 2014- ECMWF. +! (C) Copyright 2014- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE CDMAP_MOD +CONTAINS +SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& +& KFIELDS, PCOEFA, PCOEFS) + +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRB, JPRBT +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE TPM_FLT +USE TPM_GEOMETRY +USE TPM_DISTR ,ONLY : D +USE TPM_TRANS ,ONLY : FOUBUF_IN, FOUBUF +USE SEEFMM_MIX + +!**** *CDMAP* - REMAP ROOTS +! +! Purpose. +! -------- +! remap from one set of roots to another using Christoffel-Darboux formula, see Chien + Alpert, 1997. + +!** Interface. +! ---------- +! *CALL* *CDMAP(...) + +! Explicit arguments : +! -------------------- +! KM - zonal wavenumber +! KMLOC - local zonal wavenumber +! +! Method. +! ------- + +! Externals. +! ---------- + +! Reference. +! ---------- +! Chien + Alpert, 1997. + +! Author. +! ------- +! Nils Wedi *ECMWF* + +! Modifications. +! -------------- +! Original : 14-05-14 +! ------------------------------------------------------------------ + +IMPLICIT NONE + + +INTEGER(KIND=JPIM), INTENT(IN) :: KM +INTEGER(KIND=JPIM), INTENT(IN) :: KMLOC +INTEGER(KIND=JPIM), INTENT(IN) :: KSL +INTEGER(KIND=JPIM), INTENT(IN) :: KSLO +REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM +INTEGER(KIND=JPIM), INTENT(IN) :: KDIR ! direction of map +INTEGER(KIND=JPIM), INTENT(IN) :: KDGNH +INTEGER(KIND=JPIM), INTENT(IN) :: KDGNHD +INTEGER(KIND=JPIM), INTENT(IN) :: KFIELDS +REAL(KIND=JPRBT), INTENT(INOUT) :: PCOEFA(:,:) +REAL(KIND=JPRBT), INTENT(INOUT) :: PCOEFS(:,:) + +INTEGER(KIND=JPIM) :: JGL, IGL, JF +REAL(KIND=JPRBT), ALLOCATABLE :: ZALL(:,:), ZQX(:,:) +REAL(KIND=JPRBT), ALLOCATABLE :: ZALL1(:,:), ZQY(:,:) +INTEGER(KIND=JPIM) :: ISTN(KDGNH), ISTS(KDGNH) + +INTEGER(KIND=JPIM) :: IGLS, IPROC, IPROCS, IEND, IENDO + +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +!* 1. PERFORM LEGENDRE TRANFORM. +! -------------------------- + +IF (LHOOK) CALL DR_HOOK('CDMAP_MOD',0,ZHOOK_HANDLE) + +IF( KDIR == -1 ) THEN + ! inverse map from internal (gg) roots to post-processing roots + + IENDO = 2*KDGNHD - KSLO + 1 + IEND = 2*KDGNH - KSL + 1 + + !!!!! fourier buffer setup in output latitudes, may not work if different from input !!!! + DO IGL=KSLO, KDGNHD + IPROC = D%NPROCL(IGL) + ISTN(IGL) = (D%NSTAGT0B(IPROC) + D%NPNTGTB1(KMLOC,IGL))*KFIELDS + IGLS = 2*KDGNH+1-IGL + IPROCS = D%NPROCL(IGLS) + ISTS(IGL) = (D%NSTAGT0B(IPROCS) + D%NPNTGTB1(KMLOC,IGLS))*KFIELDS + ENDDO + + ALLOCATE(ZALL(KFIELDS, 2*KDGNHD)) + ALLOCATE(ZALL1(KFIELDS, 2*KDGNHD)) + ALLOCATE(ZQX(KFIELDS, 2*KDGNH)) + ALLOCATE(ZQY(KFIELDS, 2*KDGNH)) + ZQX(:,1:KSL) = 0._JPRBT + ZQX(:,IEND:2*KDGNH) = 0._JPRBT + ZQY(:,1:KSL) = 0._JPRBT + ZQY(:,IEND:2*KDGNH) = 0._JPRBT + DO JGL=KSL, IEND + ZQX(1:KFIELDS,JGL)=S%FA(KMLOC)%RPNMWI(JGL-KSL+1,1)*PCOEFA(1:KFIELDS,JGL) + ZQY(1:KFIELDS,JGL)=S%FA(KMLOC)%RPNMWI(JGL-KSL+1,2)*PCOEFA(1:KFIELDS,JGL) + ENDDO + CALL SEEFMM_MULM(S%FMM_INTI,KFIELDS,1_JPIM,.TRUE.,ZQX,ZALL1) + CALL SEEFMM_MULM(S%FMM_INTI,KFIELDS,1_JPIM,.TRUE.,ZQY,ZALL) + DEALLOCATE(ZQX) + DEALLOCATE(ZQY) + ! minus sign comes from FMM ?! + ! fill buffer + DO IGL=KSLO,KDGNHD + IGLS = 2*KDGNHD+1-IGL + DO JF=1,KFIELDS + FOUBUF_IN(ISTN(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,1)*ZALL1(JF,IGL) & + & - S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,2)*ZALL(JF,IGL) + FOUBUF_IN(ISTS(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,1)*ZALL1(JF,IGLS) & + & - S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,2)*ZALL(JF,IGLS) + ENDDO + ENDDO + DEALLOCATE(ZALL1) + DEALLOCATE(ZALL) + +ELSE +! direct map from post-processing/input field roots to internal (gg) roots +! this assumes essentially a nearest neighbour interpolation in latitude +! a more accurate approach may be +! a local gridpoint interpolation of the input field to the target latitudes prior to the transforms + + IENDO = 2*KDGNHD - KSLO + 1 + IEND = 2*KDGNH - KSL + 1 + + !!!!! fourier buffer setup in input data latitudes, may not work if different from output !!!! + DO JGL=KSLO, KDGNHD + IPROC = D%NPROCL(JGL) + ISTN(JGL) = (D%NSTAGT1B(IPROC) + D%NPNTGTB1(KMLOC,JGL))*KFIELDS + IGLS = 2*KDGNHD+1-JGL + IPROCS = D%NPROCL(IGLS) + ISTS(JGL) = (D%NSTAGT1B(IPROCS) + D%NPNTGTB1(KMLOC,IGLS))*KFIELDS + ENDDO + + ALLOCATE( ZQX( KFIELDS, 2*KDGNHD)) + ZQX(:,1:KSLO) = 0._JPRBT + ZQX(:,IENDO:2*KDGNHD) = 0._JPRBT + DO JGL=KSLO, KDGNHD + IGLS = 2*KDGNHD+1-JGL + DO JF=1,KFIELDS + ZQX(JF,JGL)=FOUBUF(ISTN(JGL)+JF) + ZQX(JF,IGLS)=FOUBUF(ISTS(JGL)+JF) + ENDDO + ENDDO + + ! split into symmetric / antisymmetric + DO IGL=KSL,KDGNH + IGLS = 2*KDGNH+1-IGL + PCOEFS(1:KFIELDS,IGL) = ZQX(1:KFIELDS,IGL) + ZQX(1:KFIELDS,IGLS) + PCOEFA(1:KFIELDS,IGL) = ZQX(1:KFIELDS,IGL) - ZQX(1:KFIELDS,IGLS) + ENDDO + + DEALLOCATE(ZQX) + +ENDIF + +IF (LHOOK) CALL DR_HOOK('CDMAP_MOD',1,ZHOOK_HANDLE) +! ------------------------------------------------------------------ + +END SUBROUTINE CDMAP +END MODULE CDMAP_MOD diff --git a/src/trans/gpu/internal/cpledn_mod.F90 b/src/trans/gpu/internal/cpledn_mod.F90 new file mode 100755 index 00000000..17f7504c --- /dev/null +++ b/src/trans/gpu/internal/cpledn_mod.F90 @@ -0,0 +1,134 @@ +! (C) Copyright 1987- ECMWF. +! (C) Copyright 1987- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE CPLEDN_MOD +CONTAINS +SUBROUTINE CPLEDN(KN,KODD,PFN,PX,KFLAG,PW,PXN,PXMOD) + +!**** *CPLEDN* - Routine to perform a single Newton iteration step to find +! the zero of the ordinary Legendre polynomial of degree N + +! Purpose. +! -------- + +!** Interface. +! ---------- +! *CALL* *CPLEDN(KN,KDBLE,PX,KFLAG,PW,PXN,PXMOD)* + +! Explicit arguments : +! -------------------- +! KN : Degree of the Legendre polynomial (in) +! KODD : odd or even number of latitudes (in) +! PFN : Fourier coefficients of series expansion (in) +! for the ordinary Legendre polynomials +! PX : abcissa where the computations are performed (in) +! KFLAG : When KFLAG.EQ.1 computes the weights (in) +! PW : Weight of the quadrature at PXN (out) +! PXN : new abscissa (Newton iteration) (out) +! PXMOD : PXN-PX (out) + +! Implicit arguments : +! -------------------- +! None + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- +! None + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 87-10-15 +! Michel Rochas, 90-08-30 (Lobatto+cleaning) +! K. Yessad (Sep 2008): cleaning, improve comments. +! Nils Wedi + Mats Hamrud, 2009-02-05 revised following Swarztrauber, 2002 +! F. Vana 05-Mar-2015 Support for single precision +! ------------------------------------------------------------------ + +USE EC_PARKIND ,ONLY : JPRD, JPIM + +! ------------------------------------------------------------------ + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KN +INTEGER(KIND=JPIM),INTENT(IN) :: KODD +REAL(KIND=JPRD),INTENT(IN) :: PFN(0:KN/2) +REAL(KIND=JPRD),INTENT(IN) :: PX +INTEGER(KIND=JPIM),INTENT(IN) :: KFLAG +REAL(KIND=JPRD),INTENT(OUT) :: PW +REAL(KIND=JPRD),INTENT(INOUT) :: PXN +REAL(KIND=JPRD),INTENT(OUT) :: PXMOD + +! ------------------------------------------------------------------ + +REAL(KIND=JPRD) :: ZDLX,ZDLK,ZDLLDN,ZDLXN,ZDLMOD + +INTEGER(KIND=JPIM), PARAMETER :: JPKD=KIND(PX) + +INTEGER(KIND=JPIM) :: JN, IK + +! ----------------------------------------------------------------- + +!* 1. NEWTON ITERATION STEP. +! ---------------------- + +ZDLX = PX + +ZDLK = 0.0_JPRD +IF( KODD==0 ) ZDLK=0.5_JPRD*PFN(0) +ZDLXN = 0.0_JPRD +ZDLLDN = 0.0_JPRD +IK=1 + +IF(KFLAG == 0)THEN + DO JN=2-KODD,KN,2 + ! normalised ordinary Legendre polynomial == \overbar{P_n}^0 + ZDLK = ZDLK + PFN(IK)*COS(REAL(JN,JPKD)*ZDLX) + ! normalised derivative == d/d\theta(\overbar{P_n}^0) + ZDLLDN = ZDLLDN - PFN(IK)*REAL(JN,JPKD)*SIN(REAL(JN,JPKD)*ZDLX) + IK=IK+1 + ENDDO + ! Newton method + ZDLMOD = -ZDLK/ZDLLDN + ZDLXN = ZDLX+ZDLMOD + PXN = ZDLXN + PXMOD = ZDLMOD +ENDIF + +! ------------------------------------------------------------------ + +!* 2. Computes weight. +! ---------------- + +IF(KFLAG == 1)THEN + DO JN=2-KODD,KN,2 + ! normalised derivative + ZDLLDN = ZDLLDN - PFN(IK)*REAL(JN,JPKD)*SIN(REAL(JN,JPKD)*ZDLX) + IK=IK+1 + ENDDO + PW = REAL(2*KN+1,JPKD)/ZDLLDN**2 +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE CPLEDN +END MODULE CPLEDN_MOD diff --git a/src/trans/gpu/internal/dealloc_resol_mod.F90 b/src/trans/gpu/internal/dealloc_resol_mod.F90 new file mode 100755 index 00000000..6a6bd20e --- /dev/null +++ b/src/trans/gpu/internal/dealloc_resol_mod.F90 @@ -0,0 +1,189 @@ +! (C) Copyright 2013- ECMWF. +! (C) Copyright 2013- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE DEALLOC_RESOL_MOD +CONTAINS +SUBROUTINE DEALLOC_RESOL(KRESOL) + +!**** *DEALLOC_RESOL* - Deallocations of a resolution + +! Purpose. +! -------- +! Release allocated arrays for a given resolution + +!** Interface. +! ---------- +! CALL DEALLOC_RESOL + +! Explicit arguments : KRESOL : resolution tag +! -------------------- + +! Method. +! ------- + +! Externals. None +! ---------- + +! Author. +! ------- +! R. El Khatib *METEO-FRANCE* + +! Modifications. +! -------------- +! Original : 09-Jul-2013 from trans_end + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM + +USE TPM_DIM ,ONLY : R +USE TPM_GEN ,ONLY : LENABLED, NOUT,NDEF_RESOL +USE TPM_DISTR ,ONLY : D,NPRTRV +USE TPM_GEOMETRY ,ONLY : G +USE TPM_FIELDS ,ONLY : F +USE TPM_FFT ,ONLY : T +USE TPM_FLT ,ONLY : S +USE TPM_CTL ,ONLY : C +USE TPM_HICFFT ,ONLY : DESTROY_ALL_PLANS_FFT +USE SEEFMM_MIX + +USE SET_RESOL_MOD ,ONLY : SET_RESOL +! + +IMPLICIT NONE + +INTEGER(KIND=JPIM), INTENT(IN) :: KRESOL +INTEGER(KIND=JPIM) :: JMLOC,IPRTRV,JSETV,IMLOC,IM,ILA,ILS, JRESOL + +! ------------------------------------------------------------------ + +IF (.NOT.LENABLED(KRESOL)) THEN + + WRITE(UNIT=NOUT,FMT='('' DEALLOC_RESOL WARNING : KRESOL = '',I3,'' ALREADY DISABLED '')') KRESOL + +ELSE + + CALL SET_RESOL(KRESOL) + + CALL DESTROY_ALL_PLANS_FFT() + + !TPM_FLT + IF( ALLOCATED(S%FA) ) THEN + DO JMLOC=1,D%NUMP,NPRTRV ! +++++++++++++++++++++ JMLOC LOOP ++++++++++ + IPRTRV=MIN(NPRTRV,D%NUMP-JMLOC+1) + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ILA = (R%NSMAX-IM+2)/2 + ILS = (R%NSMAX-IM+3)/2 + IF(.NOT.C%CIO_TYPE == 'mbuf' .AND. ASSOCIATED(S%FA(IMLOC)%RPNMA)) DEALLOCATE(S%FA(IMLOC)%RPNMA) + IF(.NOT.C%CIO_TYPE == 'mbuf' .AND. ASSOCIATED(S%FA(IMLOC)%RPNMS)) DEALLOCATE(S%FA(IMLOC)%RPNMS) + IF(S%LDLL) THEN + IF(.NOT.C%CIO_TYPE == 'mbuf' .AND. ASSOCIATED(S%FA(IMLOC)%RPNMWI)) DEALLOCATE(S%FA(IMLOC)%RPNMWI) + IF(.NOT.C%CIO_TYPE == 'mbuf' .AND. ASSOCIATED(S%FA(IMLOC)%RPNMWO)) DEALLOCATE(S%FA(IMLOC)%RPNMWO) + ENDIF + ENDDO + ENDDO + DEALLOCATE(S%FA) + ENDIF + IF(S%LDLL) THEN + CALL FREE_SEEFMM(S%FMM_INTI) + IF(ASSOCIATED(S%FMM_INTI)) DEALLOCATE(S%FMM_INTI) + ENDIF + + !TPM_DISTR + IF(ALLOCATED(D%NFRSTLAT)) DEALLOCATE(D%NFRSTLAT) + IF(ALLOCATED(D%NLSTLAT)) DEALLOCATE(D%NLSTLAT) + IF(ALLOCATED(D%NPTRLAT)) DEALLOCATE(D%NPTRLAT) + IF(ALLOCATED(D%NPTRFRSTLAT)) DEALLOCATE(D%NPTRFRSTLAT) + IF(ALLOCATED(D%NPTRLSTLAT)) DEALLOCATE(D%NPTRLSTLAT) + IF(ALLOCATED(D%LSPLITLAT)) DEALLOCATE(D%LSPLITLAT) + IF(ALLOCATED(D%NSTA)) DEALLOCATE(D%NSTA) + IF(ALLOCATED(D%NONL)) DEALLOCATE(D%NONL) + IF(ALLOCATED(D%NGPTOTL)) DEALLOCATE(D%NGPTOTL) + IF(ALLOCATED(D%NPROCA_GP)) DEALLOCATE(D%NPROCA_GP) + + IF(D%LWEIGHTED_DISTR) THEN + IF(ALLOCATED(D%RWEIGHT)) DEALLOCATE(D%RWEIGHT) + ENDIF + + IF(ALLOCATED(D%MYMS)) DEALLOCATE(D%MYMS) + IF(ALLOCATED(D%NUMPP)) DEALLOCATE(D%NUMPP) + IF(ALLOCATED(D%NPOSSP)) DEALLOCATE(D%NPOSSP) + IF(ALLOCATED(D%NPROCM)) DEALLOCATE(D%NPROCM) + IF(ALLOCATED(D%NDIM0G)) DEALLOCATE(D%NDIM0G) + IF(ALLOCATED(D%NASM0)) DEALLOCATE(D%NASM0) + IF(ALLOCATED(D%NATM0)) DEALLOCATE(D%NATM0) + IF(ALLOCATED(D%NLATLS)) DEALLOCATE(D%NLATLS) + IF(ALLOCATED(D%NLATLE)) DEALLOCATE(D%NLATLE) + IF(ALLOCATED(D%NPMT)) DEALLOCATE(D%NPMT) + IF(ALLOCATED(D%NPMS)) DEALLOCATE(D%NPMS) + IF(ALLOCATED(D%NPMG)) DEALLOCATE(D%NPMG) + IF(ALLOCATED(D%NULTPP)) DEALLOCATE(D%NULTPP) + IF(ALLOCATED(D%NPROCL)) DEALLOCATE(D%NPROCL) + IF(ALLOCATED(D%NPTRLS)) DEALLOCATE(D%NPTRLS) + IF(ALLOCATED(D%NALLMS)) DEALLOCATE(D%NALLMS) + IF(ALLOCATED(D%NPTRMS)) DEALLOCATE(D%NPTRMS) + IF(ALLOCATED(D%NSTAGT0B)) DEALLOCATE(D%NSTAGT0B) + IF(ALLOCATED(D%NSTAGT1B)) DEALLOCATE(D%NSTAGT1B) + IF(ALLOCATED(D%NPNTGTB0)) DEALLOCATE(D%NPNTGTB0) + IF(ALLOCATED(D%NPNTGTB1)) DEALLOCATE(D%NPNTGTB1) + IF(ALLOCATED(D%NLTSFTB)) DEALLOCATE(D%NLTSFTB) + IF(ALLOCATED(D%NLTSGTB)) DEALLOCATE(D%NLTSGTB) + IF(ALLOCATED(D%MSTABF)) DEALLOCATE(D%MSTABF) + IF(ALLOCATED(D%NSTAGTF)) DEALLOCATE(D%NSTAGTF) + + !TPM_FFT + IF (.NOT.D%LCPNMONLY) THEN + IF( ASSOCIATED(T) ) THEN + IF( ALLOCATED(T%TRIGS) ) DEALLOCATE(T%TRIGS) + IF( ALLOCATED(T%NFAX) ) DEALLOCATE(T%NFAX) +!! IF( ALLOCATED(T%LUSEFFT992)) DEALLOCATE(T%LUSEFFT992) + ENDIF + ENDIF + + + !TPM_FIELDS + IF(ALLOCATED(F%RMU)) DEALLOCATE(F%RMU) + IF(ALLOCATED(F%RW)) DEALLOCATE(F%RW) + IF(ALLOCATED(F%R1MU2)) DEALLOCATE(F%R1MU2) + IF(ALLOCATED(F%RACTHE)) DEALLOCATE(F%RACTHE) + IF(ALLOCATED(F%REPSNM)) DEALLOCATE(F%REPSNM) + IF(ALLOCATED(F%RN)) DEALLOCATE(F%RN) + IF(ALLOCATED(F%RLAPIN)) DEALLOCATE(F%RLAPIN) + IF(ALLOCATED(F%NLTN)) DEALLOCATE(F%NLTN) + IF( S%LKEEPRPNM ) THEN + IF(ALLOCATED(F%RPNM)) DEALLOCATE(F%RPNM) + ENDIF + IF( S%LDLL ) THEN + IF(ALLOCATED(F%RMU2)) DEALLOCATE(F%RMU2) + IF(ALLOCATED(F%RACTHE2)) DEALLOCATE(F%RACTHE2) + ENDIF + + !TPM_GEOMETRY + IF(ALLOCATED(G%NMEN)) DEALLOCATE(G%NMEN) + IF(ALLOCATED(G%NDGLU)) DEALLOCATE(G%NDGLU) + IF(ALLOCATED(G%NLOEN)) DEALLOCATE(G%NLOEN) + + LENABLED(KRESOL)=.FALSE. + NDEF_RESOL = COUNT(LENABLED) + ! Do not stay on a disabled resolution + DO JRESOL=1,SIZE(LENABLED) + IF (LENABLED(JRESOL)) THEN + CALL SET_RESOL(JRESOL) + EXIT + ENDIF + ENDDO + +ENDIF +! ------------------------------------------------------------------ + +END SUBROUTINE DEALLOC_RESOL +END MODULE DEALLOC_RESOL_MOD diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 new file mode 100755 index 00000000..a3ba882d --- /dev/null +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -0,0 +1,184 @@ +! (C) Copyright 2001- ECMWF. +! (C) Copyright 2001- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE DIR_TRANS_CTL_MOD +CONTAINS + SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,& + & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) + + !**** *DIR_TRANS_CTL* - Control routine for direct spectral transform. + + ! Purpose. + ! -------- + ! Control routine for the direct spectral transform + + !** Interface. + ! ---------- + ! CALL DIR_TRANS_CTL(...) + + ! Explicit arguments : + ! -------------------- + ! KF_UV_G - global number of spectral u-v fields + ! KF_SCALARS_G - global number of scalar spectral fields + ! KF_GP - total number of output gridpoint fields + ! KF_FS - total number of fields in fourier space + ! KF_UV - local number of spectral u-v fields + ! KF_SCALARS - local number of scalar spectral fields + ! PSPVOR(:,:) - spectral vorticity + ! PSPDIV(:,:) - spectral divergence + ! PSPSCALAR(:,:) - spectral scalarvalued fields + ! KVSETUV(:) - indicating which 'b-set' in spectral space owns a + ! vor/div field. Equivalant to NBSETLEV in the IFS. + ! The length of KVSETUV should be the GLOBAL number + ! of u/v fields which is the dimension of u and v releated + ! fields in grid-point space. + ! KVESETSC(:) - indicating which 'b-set' in spectral space owns a + ! scalar field. As for KVSETUV this argument is required + ! if the total number of processors is greater than + ! the number of processors used for distribution in + ! spectral wave space. + ! PGP(:,:,:) - gridpoint fields + + ! The ordering of the output fields is as follows (all + ! parts are optional depending on the input switches): + ! + ! u : KF_UV_G fields + ! v : KF_UV_G fields + ! scalar fields : KF_SCALARS_G fields + + ! Method. + ! ------- + + ! Externals. SHUFFLE - reshuffle fields for load balancing + ! ---------- FIELD_SPLIT - split fields in NPROMATR packets + ! LTDIR_CTL - control of Legendre transform + ! FTDIR_CTL - control of Fourier transform + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 01-01-03 + + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPRBT, JPRD, JPRB, JPIM + + USE TPM_GEN ,ONLY : NPROMATR, NOUT + USE TPM_DISTR, ONLY: NPROC + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_TRANS, ONLY: GROWING_ALLOCATION + USE TPM_GEN + USE BUFFERED_ALLOCATOR_MOD + + USE FTDIR_MOD + USE LTDIR_MOD + USE TRGTOL_MOD + USE TRLTOM_MOD + USE TRLTOM_PACK_UNPACK + + IMPLICIT NONE + + ! Declaration of arguments + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGPUV(:,:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP3A(:,:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP3B(:,:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP2(:,:,:) + + ! Local variables + + INTEGER(KIND=JPIM) :: IPTRGP(KF_GP),IPTRSPUV(NPROMATR),IPTRSPSC(NPROMATR) + INTEGER(KIND=JPIM) :: ISHFUV_G(KF_GP),ISHFSC_G(KF_GP) + INTEGER(KIND=JPIM) :: IVSETUV(KF_GP),IVSETSC(KF_GP) + INTEGER(KIND=JPIM) :: IBLKS,JBLK,ISTUV_G,IENUV_G + INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP + INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB + + REAL(KIND=JPRBT), POINTER :: FOUBUF_IN(:), FOUBUF(:) + REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:), PREEL_COMPLEX(:) + + REAL(KIND=JPRBT), POINTER :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), POINTER :: ZINPS0(:), ZINPA0(:) + + TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR + TYPE(TRGTOL_HANDLE) :: HTRGTOL + TYPE(FTDIR_HANDLE) :: HFTDIR + TYPE(TRLTOM_PACK_HANDLE) :: HTRLTOM_PACK + TYPE(TRLTOM_HANDLE) :: HTRLTOM + TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK + TYPE(LTDIR_HANDLE) :: HLTDIR + + IF(NPROMATR > 0) THEN + PRINT *, "ERROR, not implemented right now (NPROMATR > 0)" + STOP 4 + ENDIF + + ! Prepare everything + ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() + HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) + HFTDIR = PREPARE_FTDIR() + HTRLTOM_PACK = PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) + HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS) + HTRLTOM_UNPACK = PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) + HLTDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) + + CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) + + ! from the PGP arrays to PREEL_REAL + CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& + & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& + & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& + & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + + IF (KF_FS > 0) THEN + + ! fourier transform from PREEL_REAL to PREEL_COMPLEX (in-place!) + CALL GSTATS(1640,0) + CALL FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KF_FS) + CALL GSTATS(1640,1) + + CALL GSTATS(153,0) + + CALL TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + CALL TRLTOM(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) + CALL TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + CALL GSTATS(153,1) + + CALL LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS, & + & PSPVOR,PSPDIV,PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2) + + ENDIF + + END SUBROUTINE DIR_TRANS_CTL +END MODULE DIR_TRANS_CTL_MOD diff --git a/src/trans/gpu/internal/dist_grid_32_ctl_mod.F90 b/src/trans/gpu/internal/dist_grid_32_ctl_mod.F90 new file mode 100755 index 00000000..792d9896 --- /dev/null +++ b/src/trans/gpu/internal/dist_grid_32_ctl_mod.F90 @@ -0,0 +1,258 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE DIST_GRID_32_CTL_MOD +CONTAINS +SUBROUTINE DIST_GRID_32_CTL(PGPG,KFDISTG,KPROMA,KFROM,PGP) + +!**** *DIST_GRID_32_CTL* - Distributing global gridpoint array to processors + +! Purpose. +! -------- +! Routine for distributing gridpoint array + +!** Interface. +! ---------- +! CALL DIST_GRID_32_CTL(...) + +! Explicit arguments : +! -------------------- +! PGPG(:,:) - Global gridpoint array +! KFDISTG - Global number of fields to be distributed +! KPROMA - required blocking factor for gridpoint output +! KFROM(:) - Processor responsible for distributing each field +! PGP(:,:,:) - Local spectral array + +! Externals. SET2PE - compute "A and B" set from PE +! ---------- MPL.. - message passing routines + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 2000-04-01 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRM +USE MPL_MODULE + +USE TPM_DISTR +USE TPM_GEOMETRY + +USE SET2PE_MOD +USE ABORT_TRANS_MOD +USE EQ_REGIONS_MOD + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRM) ,OPTIONAL, INTENT(IN) :: PGPG(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFDISTG +INTEGER(KIND=JPIM) , INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) , INTENT(IN) :: KFROM(:) +REAL(KIND=JPRM) , INTENT(OUT) :: PGP(:,:,:) + +! Declaration of local variables + +REAL(KIND=JPRM) :: ZDUM(D%NGPTOTMX) +REAL(KIND=JPRM),ALLOCATABLE :: ZBUF(:,:,:),ZRCV2(:,:) +REAL(KIND=JPRM) :: ZRCV(D%NGPTOTMX,KFDISTG) +INTEGER(KIND=JPIM) :: JFLD,JB,JA,IGLOFF,IGL1,IGL2,IOFF,ILAST,ILOFF,ILENR +INTEGER(KIND=JPIM) :: JGL,JLON,ISND,ITAG,J,IRCV +INTEGER(KIND=JPIM) :: JKGLO,IEND,JROF,IBL,JROC +INTEGER(KIND=JPIM) :: ISENDREQ(NPROC,KFDISTG),ILEN(NPROC,KFDISTG) +INTEGER(KIND=JPIM) :: IFROM,IMYFIELDS,IFLD,IFLDSFROM(NPROC) +LOGICAL :: LLSAME + +! ------------------------------------------------------------------ + +! Copy for single PE + +IF(NPROC == 1) THEN +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF) + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JFLD=1,KFDISTG + DO JROF=1,IEND + PGP(JROF,JFLD,IBL) = PGPG(IOFF+JROF,JFLD) + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + +ELSEIF(KFDISTG>0) THEN + +! test if values in KFROM are all the same + LLSAME=.TRUE. + IFROM=KFROM(1) + DO JFLD=2,KFDISTG + IF(KFROM(JFLD) /= IFROM) THEN + LLSAME=.FALSE. + EXIT + ENDIF + ENDDO + + IMYFIELDS = 0 + DO JFLD=1,KFDISTG + IF(KFROM(JFLD) == MYPROC) THEN + IMYFIELDS = IMYFIELDS+1 + ENDIF + ENDDO + + CALL GSTATS(1663,0) + IF(IMYFIELDS > 0) THEN + ALLOCATE(ZBUF(D%NGPTOTMX,IMYFIELDS,NPROC)) + +!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1)& +!$OMP&PRIVATE(JFLD,JA,JB,ISND,IGLOFF,IGL1,IGL2,IOFF,ILAST,J,& +!$OMP&ILOFF,JGL,JLON) + DO JFLD=1,IMYFIELDS + DO JA=1,N_REGIONS_NS + DO JB=1,N_REGIONS(JA) + CALL SET2PE(ISND,JA,JB,0,0) + + IGLOFF = D%NPTRFRSTLAT(JA) + IGL1 = D%NFRSTLAT(JA) + IGL2 = D%NLSTLAT(JA) + IOFF = 0 + IF(JA > 1) THEN + IF( D%NLSTLAT(JA-1) == D%NFRSTLAT(JA) )THEN + ILAST = D%NLSTLAT(JA-1)-1 + ELSE + ILAST = D%NLSTLAT(JA-1) + ENDIF + DO J=D%NFRSTLAT(1),ILAST + IOFF = IOFF+G%NLOEN(J) + ENDDO + ENDIF + + ILEN(ISND,JFLD) = 0 + ILOFF = 0 + DO JGL=IGL1,IGL2 + DO JLON=1,D%NONL(IGLOFF+JGL-IGL1,JB) + ZBUF(ILEN(ISND,JFLD)+JLON,JFLD,ISND) = & + & PGPG(IOFF+ILOFF+D%NSTA(IGLOFF+JGL-IGL1,JB)+JLON-1,JFLD) + ENDDO + ILEN(ISND,JFLD) = ILEN(ISND,JFLD) + D%NONL(IGLOFF+JGL-IGL1,JB) + ILOFF = ILOFF + G%NLOEN(JGL) + ENDDO + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + ENDIF + CALL GSTATS(1663,1) + + ! Message passing + CALL GSTATS_BARRIER(791) + CALL GSTATS(811,0) + ! Send + IF( LLSAME )THEN + IF(KFROM(1) == MYPROC) THEN + ITAG = MTAGDISTGP + DO JROC=1,NPROC + CALL MPL_SEND(ZBUF(:,:,JROC),KDEST=NPRCIDS(JROC),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(JROC,1),& + &CDSTRING='DIST_GRID_32_CTL') + ENDDO + ENDIF + ELSE + IF(IMYFIELDS > 0) THEN + ITAG = MTAGDISTGP + DO JROC=1,NPROC + CALL MPL_SEND(ZBUF(:,:,JROC),KDEST=NPRCIDS(JROC),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(JROC,1),& + &CDSTRING='DIST_GRID_32_CTL') + ENDDO + ENDIF + ENDIF + + ! Receive + + IF( LLSAME )THEN + IRCV = KFROM(1) + ITAG = MTAGDISTGP + CALL MPL_RECV(ZRCV,KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR,CDSTRING='DIST_GRID_32_CTL:') + IF( ILENR /= D%NGPTOTMX*KFDISTG )THEN + CALL ABORT_TRANS(' DIST_GRID_32_CTL: INVALID RECEIVE MESSAGE LENGTH 1') + ENDIF + ELSE + IFLDSFROM(:)=0 + DO JFLD=1,KFDISTG + IFLDSFROM(KFROM(JFLD)) = IFLDSFROM(KFROM(JFLD))+1 + ENDDO + ITAG = MTAGDISTGP + DO JROC=1,NPROC + IF(IFLDSFROM(JROC) > 0 ) THEN + IRCV = JROC + ALLOCATE(ZRCV2(D%NGPTOTMX,IFLDSFROM(JROC))) + CALL MPL_RECV(ZRCV2,KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR,CDSTRING='DIST_GRID_32_CTL:') + IF( ILENR /= D%NGPTOTMX*IFLDSFROM(JROC) )THEN + CALL ABORT_TRANS(' DIST_GRID_32_CTL: INVALID RECEIVE MESSAGE LENGTH 2') + ENDIF + IFLD = 0 + DO JFLD=1,KFDISTG + IF(KFROM(JFLD) == JROC) THEN + IFLD = IFLD+1 + ZRCV(1:D%NGPTOT,JFLD) = ZRCV2(1:D%NGPTOT,IFLD) + ENDIF + ENDDO + DEALLOCATE(ZRCV2) + ENDIF + ENDDO + ENDIF + +! Wait for send to complete + + IF( LLSAME )THEN + IF(KFROM(1) == MYPROC) THEN + CALL MPL_WAIT(KREQUEST=ISENDREQ(:,1), & + & CDSTRING='DIST_GRID_32_CTL: WAIT 1') + ENDIF + ELSEIF(IMYFIELDS > 0) THEN + CALL MPL_WAIT(KREQUEST=ISENDREQ(:,1), & + & CDSTRING='DIST_GRID_32_CTL: WAIT 2') + ENDIF + CALL GSTATS(811,1) + CALL GSTATS_BARRIER2(791) + + CALL GSTATS(1663,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF) + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JFLD=1,KFDISTG + DO JROF=1,IEND + PGP(JROF,JFLD,IBL) = ZRCV(IOFF+JROF,JFLD) + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + CALL GSTATS(1663,1) + !Synchronize processors + CALL GSTATS(786,0) + CALL MPL_BARRIER(CDSTRING='DIST_GRID_32_CTL:') + CALL GSTATS(786,1) + IF(ALLOCATED(ZBUF)) DEALLOCATE(ZBUF) +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE DIST_GRID_32_CTL +END MODULE DIST_GRID_32_CTL_MOD diff --git a/src/trans/gpu/internal/dist_grid_ctl_mod.F90 b/src/trans/gpu/internal/dist_grid_ctl_mod.F90 new file mode 100755 index 00000000..184a7184 --- /dev/null +++ b/src/trans/gpu/internal/dist_grid_ctl_mod.F90 @@ -0,0 +1,280 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2013- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE DIST_GRID_CTL_MOD +CONTAINS +SUBROUTINE DIST_GRID_CTL(PGPG,KFDISTG,KPROMA,KFROM,PGP,KSORT) + +!**** *DIST_GRID_CTL* - Distributing global gridpoint array to processors + +! Purpose. +! -------- +! Routine for distributing gridpoint array + +!** Interface. +! ---------- +! CALL DIST_GRID_CTL(...) + +! Explicit arguments : +! -------------------- +! PGPG(:,:) - Global gridpoint array +! KFDISTG - Global number of fields to be distributed +! KPROMA - required blocking factor for gridpoint output +! KFROM(:) - Processor responsible for distributing each field +! PGP(:,:,:) - Local spectral array +! KSORT(:) - Add KSORT + +! Externals. SET2PE - compute "A and B" set from PE +! ---------- MPL.. - message passing routines + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 2000-04-01 +! P.Marguinaud : 2014-10-10 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB +USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_BARRIER, MPL_WAIT, & + & JP_BLOCKING_STANDARD, JP_NON_BLOCKING_STANDARD + +USE TPM_DISTR ,ONLY : D, MTAGDISTGP, NPRCIDS, MYPROC, NPROC +USE TPM_GEOMETRY ,ONLY : G + +USE SET2PE_MOD ,ONLY : SET2PE +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_NS +! + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGPG(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFDISTG +INTEGER(KIND=JPIM) , INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) , INTENT(IN) :: KFROM(:) +REAL(KIND=JPRB) , INTENT(OUT) :: PGP(:,:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN), TARGET :: KSORT (:) + +! Declaration of local variables + +! SS/2018: Removed stack hogs + +!REAL(KIND=JPRB) :: ZDUM(D%NGPTOTMX) -- not used +REAL(KIND=JPRB),ALLOCATABLE :: ZBUF(:,:,:) +REAL(KIND=JPRB),ALLOCATABLE :: ZRCV(:,:) ! (D%NGPTOTMX,KFDISTG) +INTEGER(KIND=JPIM) :: JFLD,JB,JA,IGLOFF,IGL1,IGL2,IOFF,ILAST,ILOFF,ILENR +INTEGER(KIND=JPIM) :: JGL,JLON,ISND,ITAG,J,IRCV +INTEGER(KIND=JPIM) :: JKGLO,IEND,JROF,IBL,JROC +INTEGER(KIND=JPIM) :: ISENDREQ(NPROC,KFDISTG),ILEN(NPROC,KFDISTG), IRECVREQ(KFDISTG) +INTEGER(KIND=JPIM) :: IFROM,IMYFIELDS,IFLD +INTEGER(KIND=JPIM), POINTER :: ISORT (:) +LOGICAL :: LLSAME + +! ------------------------------------------------------------------ + +IF (PRESENT (KSORT)) THEN + ISORT => KSORT +ELSE + ALLOCATE (ISORT (KFDISTG)) + DO JFLD = 1, KFDISTG + ISORT (JFLD) = JFLD + ENDDO +ENDIF + +! Copy for single PE + +IF(NPROC == 1) THEN +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF) + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JFLD=1,KFDISTG + DO JROF=1,IEND + PGP(JROF,ISORT(JFLD),IBL) = PGPG(IOFF+JROF,JFLD) + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + +ELSEIF(KFDISTG>0) THEN + +! test if values in KFROM are all the same + LLSAME=.TRUE. + IFROM=KFROM(1) + DO JFLD=2,KFDISTG + IF(KFROM(JFLD) /= IFROM) THEN + LLSAME=.FALSE. + EXIT + ENDIF + ENDDO + + IMYFIELDS = 0 + DO JFLD=1,KFDISTG + IF(KFROM(JFLD) == MYPROC) THEN + IMYFIELDS = IMYFIELDS+1 + ENDIF + ENDDO + + CALL GSTATS(1663,0) + IF(IMYFIELDS > 0) THEN + ALLOCATE(ZBUF(D%NGPTOTMX,IMYFIELDS,NPROC)) + +!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1)& +!$OMP&PRIVATE(JFLD,JA,JB,ISND,IGLOFF,IGL1,IGL2,IOFF,ILAST,J,& +!$OMP&ILOFF,JGL,JLON) + DO JFLD=1,IMYFIELDS + DO JA=1,N_REGIONS_NS + DO JB=1,N_REGIONS(JA) + CALL SET2PE(ISND,JA,JB,0,0) + + IGLOFF = D%NPTRFRSTLAT(JA) + IGL1 = D%NFRSTLAT(JA) + IGL2 = D%NLSTLAT(JA) + IOFF = 0 + IF(JA > 1) THEN + IF( D%NLSTLAT(JA-1) == D%NFRSTLAT(JA) )THEN + ILAST = D%NLSTLAT(JA-1)-1 + ELSE + ILAST = D%NLSTLAT(JA-1) + ENDIF + DO J=D%NFRSTLAT(1),ILAST + IOFF = IOFF+G%NLOEN(J) + ENDDO + ENDIF + + ILEN(ISND,JFLD) = 0 + ILOFF = 0 + DO JGL=IGL1,IGL2 + DO JLON=1,D%NONL(IGLOFF+JGL-IGL1,JB) + ZBUF(ILEN(ISND,JFLD)+JLON,JFLD,ISND) = & + & PGPG(IOFF+ILOFF+D%NSTA(IGLOFF+JGL-IGL1,JB)+JLON-1,JFLD) + ENDDO + ILEN(ISND,JFLD) = ILEN(ISND,JFLD) + D%NONL(IGLOFF+JGL-IGL1,JB) + ILOFF = ILOFF + G%NLOEN(JGL) + ENDDO + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + ENDIF + CALL GSTATS(1663,1) + + ! Message passing + CALL GSTATS_BARRIER(791) + CALL GSTATS(811,0) + ! Receive + + ALLOCATE(ZRCV(D%NGPTOTMX,KFDISTG)) + + IF( LLSAME )THEN + IRCV = KFROM(1) + ITAG = MTAGDISTGP + CALL MPL_RECV(ZRCV,KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IRECVREQ(1),CDSTRING='DIST_GRID_CTL:') + ELSE + DO JFLD=1,KFDISTG + IRCV = KFROM(JFLD) + ITAG = MTAGDISTGP+JFLD + CALL MPL_RECV(ZRCV(:,JFLD),KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IRECVREQ(JFLD),CDSTRING='DIST_GRID_CTL:') + ENDDO + ENDIF + + + ! Send + IF( LLSAME )THEN + IF(KFROM(1) == MYPROC) THEN + ITAG = MTAGDISTGP + DO JROC=1,NPROC + CALL MPL_SEND(ZBUF(:,:,JROC),KDEST=NPRCIDS(JROC),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(JROC,1),& + &CDSTRING='DIST_GRID_CTL') + ENDDO + ENDIF + ELSE + IFLD = 0 + DO JFLD=1,KFDISTG + IF(KFROM(JFLD) == MYPROC) THEN + IFLD = IFLD+1 + ITAG = MTAGDISTGP+JFLD + DO JROC=1,NPROC + CALL MPL_SEND(ZBUF(1:ILEN(JROC,IFLD),IFLD,JROC),KDEST=NPRCIDS(JROC),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(JROC,JFLD),& + &CDSTRING='DIST_GRID_CTL') + ENDDO + ENDIF + ENDDO + ENDIF + + +! Wait for sends and receives to complete + + IF( LLSAME )THEN + IF(KFROM(1) == MYPROC) THEN + CALL MPL_WAIT(KREQUEST=ISENDREQ(:,1), & + & CDSTRING='DIST_GRID_CTL: WAIT 1') + ENDIF + CALL MPL_WAIT(KREQUEST=IRECVREQ(1), & + & CDSTRING='DIST_GRID_CTL: WAIT 2') + ELSE + DO JFLD=1,KFDISTG + IF(KFROM(JFLD) == MYPROC) THEN + CALL MPL_WAIT(KREQUEST=ISENDREQ(:,JFLD), & + & CDSTRING='DIST_GRID_CTL: WAIT 3') + ENDIF + CALL MPL_WAIT(KREQUEST=IRECVREQ(JFLD), & + & CDSTRING='DIST_GRID_CTL: WAIT 4') + ENDDO + ENDIF + + CALL GSTATS(811,1) + CALL GSTATS_BARRIER2(791) + + CALL GSTATS(1663,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF) + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JFLD=1,KFDISTG + DO JROF=1,IEND + PGP(JROF,ISORT(JFLD),IBL) = ZRCV(IOFF+JROF,JFLD) + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + CALL GSTATS(1663,1) + DEALLOCATE(ZRCV) + !Synchronize processors + CALL GSTATS(786,0) + CALL MPL_BARRIER(CDSTRING='DIST_GRID_CTL:') + CALL GSTATS(786,1) + IF(ALLOCATED(ZBUF)) DEALLOCATE(ZBUF) +ENDIF + +IF (.NOT. PRESENT (KSORT)) THEN + DEALLOCATE (ISORT) +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE DIST_GRID_CTL +END MODULE DIST_GRID_CTL_MOD + + + + diff --git a/src/trans/gpu/internal/dist_spec_control_mod.F90 b/src/trans/gpu/internal/dist_spec_control_mod.F90 new file mode 100755 index 00000000..449889a9 --- /dev/null +++ b/src/trans/gpu/internal/dist_spec_control_mod.F90 @@ -0,0 +1,233 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE DIST_SPEC_CONTROL_MOD +CONTAINS +SUBROUTINE DIST_SPEC_CONTROL(PSPECG,KFDISTG,KFROM,KVSET,PSPEC,LDIM1_IS_FLD,& + & KSMAX,KSPEC2,KSPEC2_G,KPOSSP,KDIM0G,KSORT) + +!**** *DIST_SPEC_CONTROL* - Distribute global spectral array among processors + +! Purpose. +! -------- +! Routine for distributing spectral array + +!** Interface. +! ---------- +! CALL DIST_SPEC_CONTROL(...) + +! Explicit arguments : +! -------------------- +! PSPECG(:,:) - Global spectral array +! KFDISTG - Global number of fields to be distributed +! KFROM(:) - Processor resposible for distributing each field +! KVSET(:) - "B-Set" for each field +! PSPEC(:,:) - Local spectral array +! KSORT(:) - Re-order fields on output + +! Externals. SET2PE - compute "A and B" set from PE +! ---------- MPL.. - message passing routines + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 2000-04-01 +! P.Marguinaud : 2014-10-10 + +! ------------------------------------------------------------------ + + +USE PARKIND1 ,ONLY : JPIM ,JPRB +USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_BARRIER, MPL_WAIT, & + & JP_NON_BLOCKING_STANDARD + +!USE TPM_GEN +!USE TPM_DIM +USE TPM_DISTR ,ONLY : MTAGDISTSP, MYSETV, NPRCIDS, NPRTRW, MYPROC, NPROC + +USE SET2PE_MOD ,ONLY : SET2PE +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPECG(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFDISTG +INTEGER(KIND=JPIM) , INTENT(IN) :: KFROM(:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KVSET(:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPEC(:,:) +LOGICAL ,OPTIONAL, INTENT(IN) :: LDIM1_IS_FLD +INTEGER(KIND=JPIM) , INTENT(IN) :: KSMAX +INTEGER(KIND=JPIM) , INTENT(IN) :: KSPEC2 +INTEGER(KIND=JPIM) , INTENT(IN) :: KSPEC2_G +INTEGER(KIND=JPIM) , INTENT(IN) :: KPOSSP(:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KDIM0G(0:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN), TARGET :: KSORT (:) + +INTEGER(KIND=JPIM) :: IDIST(KSPEC2_G) +REAL(KIND=JPRB) :: ZFLD(KSPEC2) +REAL(KIND=JPRB),ALLOCATABLE :: ZBUF(:,:) +INTEGER(KIND=JPIM) :: JM,JN,II,IFLDR,IFLDS,JFLD,ITAG,JNM,IBSET,ILEN,JA,ISND +INTEGER(KIND=JPIM) :: IRCV,ISTA,ISTP,ILENR,ISENDREQ(NPRTRW*KFDISTG) +INTEGER(KIND=JPIM) :: ISMAX, ISPEC2, IPOS0,ISENT +INTEGER(KIND=JPIM), POINTER :: ISORT (:) + +! ------------------------------------------------------------------ + + +! Compute help array for distribution + +IF (PRESENT (KSORT)) THEN + ISORT => KSORT +ELSE + ALLOCATE (ISORT (KFDISTG)) + DO JFLD = 1, KFDISTG + ISORT (JFLD) = JFLD + ENDDO +ENDIF + +IF( NPROC == 1 ) THEN + CALL GSTATS(1644,0) + IF(LDIM1_IS_FLD) THEN +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JNM,JFLD) + DO JNM=1,KSPEC2_G + DO JFLD=1,KFDISTG + PSPEC(ISORT (JFLD),JNM) = PSPECG(JFLD,JNM) + ENDDO + ENDDO +!$OMP END PARALLEL DO + ELSE +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JNM,JFLD) + DO JFLD=1,KFDISTG + DO JNM=1,KSPEC2_G + PSPEC(JNM,ISORT (JFLD)) = PSPECG(JNM,JFLD) + ENDDO + ENDDO +!$OMP END PARALLEL DO + ENDIF + CALL GSTATS(1644,1) +ELSE + II = 0 + CALL GSTATS(1804,0) + DO JM=0,KSMAX + DO JN=JM,KSMAX + IDIST(II+1) = KDIM0G(JM)+(JN-JM)*2 + IDIST(II+2) = KDIM0G(JM)+(JN-JM)*2+1 + II = II+2 + ENDDO + ENDDO + CALL GSTATS(1804,1) + +!Distribute spectral array + + IFLDS = 0 + DO JFLD=1,KFDISTG + IF(KFROM(JFLD) == MYPROC) THEN + IFLDS = IFLDS+1 + ENDIF + ENDDO + ALLOCATE(ZBUF(KSPEC2_G,IFLDS)) + + CALL GSTATS(1644,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JNM,JFLD) + DO JFLD=1,IFLDS + IF(LDIM1_IS_FLD) THEN + DO JNM=1,KSPEC2_G + ZBUF(IDIST(JNM),JFLD) = PSPECG(JFLD,JNM) + ENDDO + ELSE + DO JNM=1,KSPEC2_G + ZBUF(IDIST(JNM),JFLD) = PSPECG(JNM,JFLD) + ENDDO + ENDIF + ENDDO +!$OMP END PARALLEL DO + CALL GSTATS(1644,1) + + IFLDR = 0 + IFLDS = 0 + ISENT = 0 + + CALL GSTATS_BARRIER(790) + CALL GSTATS(812,0) + DO JFLD=1,KFDISTG + + ! Send + IF(KFROM(JFLD) == MYPROC) THEN + IFLDS = IFLDS+1 + IBSET = KVSET(JFLD) + ITAG = MTAGDISTSP+JFLD + + DO JA=1,NPRTRW + ILEN = KPOSSP(JA+1)-KPOSSP(JA) + IF( ILEN > 0 )THEN + CALL SET2PE(ISND,0,0,JA,IBSET) + ISTA = KPOSSP(JA) + ISTP = ISTA+ILEN-1 + ISENT = ISENT+1 + CALL MPL_SEND(ZBUF(ISTA:ISTP,IFLDS),KDEST=NPRCIDS(ISND),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(ISENT),& + &CDSTRING='DIST_SPEC_CONTROL:') + ENDIF + ENDDO + ENDIF + ENDDO + + !Receive + DO JFLD=1,KFDISTG + IBSET = KVSET(JFLD) + IF( IBSET == MYSETV )THEN + ITAG = MTAGDISTSP+JFLD + IF( KSPEC2 > 0 )THEN + IRCV = KFROM(JFLD) + IFLDR = IFLDR+1 + IF(LDIM1_IS_FLD) THEN + CALL MPL_RECV(ZFLD,KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,& + &KOUNT=ILENR,CDSTRING='DIST_SPEC_CONTROL:') + PSPEC(ISORT (IFLDR),1:KSPEC2) = ZFLD(:) + ELSE + CALL MPL_RECV(PSPEC(:,ISORT (IFLDR)),KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,& + &KOUNT=ILENR,CDSTRING='DIST_SPEC_CONTROL:') + ENDIF + IF( ILENR /= KSPEC2 )THEN + CALL ABORT_TRANS('DIST_SPEC_CONTROL:INVALID RECEIVE MESSAGE LENGTH') + ENDIF + ENDIF + ENDIF + ENDDO + + DO JA=1,ISENT + CALL MPL_WAIT(KREQUEST=ISENDREQ(JA), & + & CDSTRING='DIST_SPEC_CTL: WAIT') + ENDDO + + CALL GSTATS(812,1) + CALL GSTATS_BARRIER2(790) + +!Synchronize processors + CALL GSTATS(787,0) + IF( NPROC > 1 )THEN + CALL MPL_BARRIER(CDSTRING='DIST_SPEC_CONTROL:') + ENDIF + CALL GSTATS(787,1) + IF(ALLOCATED(ZBUF)) DEALLOCATE(ZBUF) +ENDIF + +IF (.NOT. PRESENT (KSORT)) THEN + DEALLOCATE (ISORT) +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE DIST_SPEC_CONTROL +END MODULE DIST_SPEC_CONTROL_MOD diff --git a/src/trans/gpu/internal/ectrans_version_mod.F90.in b/src/trans/gpu/internal/ectrans_version_mod.F90.in new file mode 100644 index 00000000..88cae2da --- /dev/null +++ b/src/trans/gpu/internal/ectrans_version_mod.F90.in @@ -0,0 +1,47 @@ +! (C) Copyright 2023- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE ECTRANS_VERSION_MOD + +IMPLICIT NONE + +CONTAINS + + FUNCTION ECTRANS_VERSION_STR() + + !**** *ECTRANS_VERSION_STR* - Return ecTrans version as a string + + CHARACTER(LEN=LEN("@ectrans_VERSION_STR@")) :: ECTRANS_VERSION_STR + + ECTRANS_VERSION_STR = "@ectrans_VERSION_STR@" + END FUNCTION ECTRANS_VERSION_STR + + FUNCTION ECTRANS_VERSION_INT() + + !**** *ECTRANS_VERSION_INT* - Return ecTrans version as an integer + + USE PARKIND1 ,ONLY : JPIM + + INTEGER(KIND=JPIM) :: ECTRANS_VERSION_INT + + ECTRANS_VERSION_INT = 10000_JPIM * @ectrans_VERSION_MAJOR@ & + & + 100_JPIM * @ectrans_VERSION_MINOR@ & + & + 10_JPIM * @ectrans_VERSION_PATCH@ + END FUNCTION ECTRANS_VERSION_INT + + FUNCTION ECTRANS_GIT_SHA1() + + !**** *ECTRANS_GIT_SHA1* - Return the SHA-1 hash of the latest Git commit + + CHARACTER(LEN=LEN("@ectrans_GIT_SHA1@")) :: ECTRANS_GIT_SHA1 + + ECTRANS_GIT_SHA1 = "@ectrans_GIT_SHA1@" + END FUNCTION ECTRANS_GIT_SHA1 + +END MODULE ECTRANS_VERSION_MOD diff --git a/src/trans/gpu/internal/eq_regions_mod.F90 b/src/trans/gpu/internal/eq_regions_mod.F90 new file mode 100755 index 00000000..5888c107 --- /dev/null +++ b/src/trans/gpu/internal/eq_regions_mod.F90 @@ -0,0 +1,443 @@ +! (C) Copyright 2006- ECMWF. +! (C) Copyright 2006- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE eq_regions_mod +! +! Purpose. +! -------- +! eq_regions_mod provides the code to perform a high level +! partitioning of the surface of a sphere into regions of +! equal area and small diameter. +! the type. +! +! Background. +! ----------- +! This Fortran version of eq_regions is a much cut down version of the +! "Recursive Zonal Equal Area (EQ) Sphere Partitioning Toolbox" of the +! same name developed by Paul Leopardi at the University of New South Wales. +! This version has been coded specifically for the case of partitioning the +! surface of a sphere or S^dim (where dim=2) as denoted in the original code. +! Only a subset of the original eq_regions package has been coded to determine +! the high level distribution of regions on a sphere, as the detailed +! distribution of grid points to each region is left to IFS software. +! This is required to take into account the spatial distribution of grid +! points in an IFS gaussian grid and provide an optimal (i.e. exact) +! distribution of grid points over regions. +! +! The following copyright notice for the eq_regions package is included from +! the original MatLab release. +! +! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +! + Release 1.10 2005-06-26 + +! + + +! + Copyright (c) 2004, 2005, University of New South Wales + +! + + +! + Permission is hereby granted, free of charge, to any person obtaining + +! + a copy of this software and associated documentation files (the + +! + "Software"), to deal in the Software without restriction, including + +! + without limitation the rights to use, copy, modify, merge, publish, + +! + distribute, sublicense, and/or sell copies of the Software, and to + +! + permit persons to whom the Software is furnished to do so, subject to + +! + the following conditions: + +! + + +! + The above copyright notice and this permission notice shall be included + +! + in all copies or substantial portions of the Software. + +! + + +! + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + +! + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + +! + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + +! + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + +! + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + +! + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + +! + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +! + + +! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +! +! Author. +! ------- +! George Mozdzynski *ECMWF* +! +! Modifications. +! -------------- +! Original : 2006-04-15 +! +!-------------------------------------------------------------------------------- +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT + +IMPLICIT NONE + +SAVE + +PRIVATE + +PUBLIC eq_regions,l_regions_debug,n_regions_ns,n_regions_ew,n_regions,my_region_ns,my_region_ew +PUBLIC eq_regions_t, eq_regions_save, eq_regions_load, eq_regions_free + +real(kind=JPRBT) :: pi + +type eq_regions_t +logical :: l_regions_debug=.false. +integer(kind=jpim) :: n_regions_ns +integer(kind=jpim) :: n_regions_ew +integer(kind=jpim) :: my_region_ns +integer(kind=jpim) :: my_region_ew +integer(kind=jpim),pointer :: n_regions(:) => null () +end type eq_regions_t + +logical :: l_regions_debug=.false. +integer(kind=jpim) :: n_regions_ns +integer(kind=jpim) :: n_regions_ew +integer(kind=jpim) :: my_region_ns +integer(kind=jpim) :: my_region_ew +integer(kind=jpim),pointer :: n_regions(:) => null () + +CONTAINS + +subroutine eq_regions_save (yder) +type (eq_regions_t), intent (inout) :: yder + +yder%l_regions_debug = l_regions_debug +yder%n_regions_ns = n_regions_ns +yder%n_regions_ew = n_regions_ew +yder%my_region_ns = my_region_ns +yder%my_region_ew = my_region_ew +yder%n_regions => n_regions + +nullify (n_regions) + +end subroutine + +subroutine eq_regions_load (yder) +type (eq_regions_t), intent (inout) :: yder + +l_regions_debug = yder%l_regions_debug +n_regions_ns = yder%n_regions_ns +n_regions_ew = yder%n_regions_ew +my_region_ns = yder%my_region_ns +my_region_ew = yder%my_region_ew +n_regions => yder%n_regions + +nullify (yder%n_regions) + +end subroutine + +subroutine eq_regions_free (yder) +type (eq_regions_t), intent (inout) :: yder + +if (associated (yder%n_regions)) then + deallocate (yder%n_regions) + nullify (yder%n_regions) +endif + +end subroutine + +subroutine eq_regions(N) +! +! eq_regions uses the zonal equal area sphere partitioning algorithm to partition +! the surface of a sphere into N regions of equal area and small diameter. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +integer(kind=jpim),intent(in) :: N +integer(kind=jpim) :: n_collars,j +real(kind=JPRBT),allocatable :: r_regions(:) +real(kind=JPRBT) :: c_polar + +pi=2.0_JPRBT*asin(1.0_JPRBT) + +n_regions(:)=0 + +if( N == 1 )then + + ! + ! We have only one region, which must be the whole sphere. + ! + n_regions(1)=1 + n_regions_ns=1 + +else + + ! + ! Given N, determine c_polar + ! the colatitude of the North polar spherical cap. + ! + c_polar = polar_colat(N) + ! + ! Given N, determine the ideal angle for spherical collars. + ! Based on N, this ideal angle, and c_polar, + ! determine n_collars, the number of collars between the polar caps. + ! + n_collars = num_collars(N,c_polar,ideal_collar_angle(N)) + n_regions_ns=n_collars+2 + ! + ! Given N, c_polar and n_collars, determine r_regions, + ! a list of the ideal real number of regions in each collar, + ! plus the polar caps. + ! The number of elements is n_collars+2. + ! r_regions[1] is 1. + ! r_regions[n_collars+2] is 1. + ! The sum of r_regions is N. + allocate(r_regions(n_collars+2)) + call ideal_region_list(N,c_polar,n_collars,r_regions) + ! + ! Given N and r_regions, determine n_regions, a list of the natural number + ! of regions in each collar and the polar caps. + ! This list is as close as possible to r_regions. + ! The number of elements is n_collars+2. + ! n_regions[1] is 1. + ! n_regions[n_collars+2] is 1. + ! The sum of n_regions is N. + ! + call round_to_naturals(N,n_collars,r_regions) + deallocate(r_regions) + if( N /= sum(n_regions(:)) )then + write(*,'("eq_regions: N=",I10," sum(n_regions(:))=",I10)')N,sum(n_regions(:)) + call abor1('eq_regions: N /= sum(n_regions)') + endif + +endif + +if( l_regions_debug )then + write(*,'("eq_regions: N=",I6," n_regions_ns=",I4)') N,n_regions_ns + do j=1,n_regions_ns + write(*,'("eq_regions: n_regions(",I4,")=",I4)') j,n_regions(j) + enddo +endif +n_regions_ew=maxval(n_regions(:)) + +return +end subroutine eq_regions + +function num_collars(N,c_polar,a_ideal) result(num_c) +! +!NUM_COLLARS The number of collars between the polar caps +! +! Given N, an ideal angle, and c_polar, +! determine n_collars, the number of collars between the polar caps. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +integer(kind=jpim),intent(in) :: N +real(kind=JPRBT),intent(in) :: a_ideal,c_polar +integer(kind=jpim) :: num_c +logical enough +enough = (N > 2) .and. (a_ideal > 0) +if( enough )then + num_c = max(1,nint((pi-2.*c_polar)/a_ideal)) +else + num_c = 0 +endif +return +end function num_collars + +subroutine ideal_region_list(N,c_polar,n_collars,r_regions) +! +!IDEAL_REGION_LIST The ideal real number of regions in each zone +! +! List the ideal real number of regions in each collar, plus the polar caps. +! +! Given N, c_polar and n_collars, determine r_regions, a list of the ideal real +! number of regions in each collar, plus the polar caps. +! The number of elements is n_collars+2. +! r_regions[1] is 1. +! r_regions[n_collars+2] is 1. +! The sum of r_regions is N. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +integer(kind=jpim),intent(in) :: N,n_collars +real(kind=JPRBT),intent(in) :: c_polar +real(kind=JPRBT),intent(out) :: r_regions(n_collars+2) +integer(kind=jpim) :: collar_n +real(kind=JPRBT) :: ideal_region_area,ideal_collar_area +real(kind=JPRBT) :: a_fitting +r_regions(:)=0.0_JPRBT +r_regions(1) = 1.0_JPRBT +if( n_collars > 0 )then + ! + ! Based on n_collars and c_polar, determine a_fitting, + ! the collar angle such that n_collars collars fit between the polar caps. + ! + a_fitting = (pi-2.0_JPRBT*c_polar)/float(n_collars) + ideal_region_area = area_of_ideal_region(N) + do collar_n=1,n_collars + ideal_collar_area = area_of_collar(c_polar+(collar_n-1)*a_fitting, & + & c_polar+collar_n*a_fitting) + r_regions(1+collar_n) = ideal_collar_area / ideal_region_area + enddo +endif +r_regions(2+n_collars) = 1. +return +end subroutine ideal_region_list + +function ideal_collar_angle(N) result(ideal) +! +! IDEAL_COLLAR_ANGLE The ideal angle for spherical collars of an EQ partition +! +! IDEAL_COLLAR_ANGLE(N) sets ANGLE to the ideal angle for the +! spherical collars of an EQ partition of the unit sphere S^2 into N regions. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +integer(kind=jpim),intent(in) :: N +real(kind=JPRBT) :: ideal +ideal = area_of_ideal_region(N)**(0.5_JPRBT) +return +end function ideal_collar_angle + +subroutine round_to_naturals(N,n_collars,r_regions) +! +! ROUND_TO_NATURALS Round off a given list of numbers of regions +! +! Given N and r_regions, determine n_regions, a list of the natural number +! of regions in each collar and the polar caps. +! This list is as close as possible to r_regions, using rounding. +! The number of elements is n_collars+2. +! n_regions[1] is 1. +! n_regions[n_collars+2] is 1. +! The sum of n_regions is N. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +integer(kind=jpim),intent(in) :: N,n_collars +real(kind=JPRBT),intent(in) :: r_regions(n_collars+2) +integer(kind=jpim) :: zone_n +real(kind=JPRBT) :: discrepancy +n_regions(1:n_collars+2) = r_regions(:) +discrepancy = 0.0_JPRBT +do zone_n = 1,n_collars+2 + n_regions(zone_n) = nint(r_regions(zone_n)+discrepancy); + discrepancy = discrepancy+r_regions(zone_n)-float(n_regions(zone_n)); +enddo +return +end subroutine round_to_naturals + +function polar_colat(N) result(polar_c) +! +! Given N, determine the colatitude of the North polar spherical cap. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +integer(kind=jpim),intent(in) :: N +real(kind=JPRBT) :: area +real(kind=JPRBT) :: polar_c +if( N == 1 ) polar_c=pi +if( N == 2 ) polar_c=pi/2.0_JPRBT +if( N > 2 )then + area=area_of_ideal_region(N) + polar_c=sradius_of_cap(area) +endif +return +end function polar_colat + +function area_of_ideal_region(N) result(area) +! +! AREA_OF_IDEAL_REGION(N) sets AREA to be the area of one of N equal +! area regions on S^2, that is 1/N times AREA_OF_SPHERE. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +integer(kind=jpim),intent(in) :: N +real(kind=JPRBT) :: area_of_sphere +real(kind=JPRBT) :: area +area_of_sphere = (2.0_JPRBT*pi**1.5_JPRBT/gamma(1.5_JPRBT)) +area = area_of_sphere/float(N) +return +end function area_of_ideal_region + +function sradius_of_cap(area) result(sradius) +! +! SRADIUS_OF_CAP(AREA) returns the spherical radius of +! an S^2 spherical cap of area AREA. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +real(kind=JPRBT),intent(in) :: area +real(kind=JPRBT) :: sradius +sradius = 2.0_JPRBT*asin(sqrt(area/pi)/2.0_JPRBT) +return +end function sradius_of_cap + +function area_of_collar(a_top, a_bot) result(area) +! +! AREA_OF_COLLAR Area of spherical collar +! +! AREA_OF_COLLAR(A_TOP, A_BOT) sets AREA to be the area of an S^2 spherical +! collar specified by A_TOP, A_BOT, where A_TOP is top (smaller) spherical radius, +! A_BOT is bottom (larger) spherical radius. +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +real(kind=JPRBT),intent(in) :: a_top,a_bot +real(kind=JPRBT) area +area = area_of_cap(a_bot) - area_of_cap(a_top) +return +end function area_of_collar + +function area_of_cap(s_cap) result(area) +! +! AREA_OF_CAP Area of spherical cap +! +! AREA_OF_CAP(S_CAP) sets AREA to be the area of an S^2 spherical +! cap of spherical radius S_CAP. +! +real(kind=JPRBT),intent(in) :: s_cap +real(kind=JPRBT) area +area = 4.0_JPRBT*pi * sin(s_cap/2.0_JPRBT)**2 +return +end function area_of_cap + +function gamma(x) result(gamma_res) +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +IMPLICIT NONE +real(kind=JPRBT),intent(in) :: x +real(kind=JPRBT) :: gamma_res +real(kind=JPRBT) :: p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13 +real(kind=JPRBT) :: w,y +integer(kind=jpim) :: k,n +parameter (& +& p0 = 0.999999999999999990e+00_JPRBT,& +& p1 = -0.422784335098466784e+00_JPRBT,& +& p2 = -0.233093736421782878e+00_JPRBT,& +& p3 = 0.191091101387638410e+00_JPRBT,& +& p4 = -0.024552490005641278e+00_JPRBT,& +& p5 = -0.017645244547851414e+00_JPRBT,& +& p6 = 0.008023273027855346e+00_JPRBT) +parameter (& +& p7 = -0.000804329819255744e+00_JPRBT,& +& p8 = -0.000360837876648255e+00_JPRBT,& +& p9 = 0.000145596568617526e+00_JPRBT,& +& p10 = -0.000017545539395205e+00_JPRBT,& +& p11 = -0.000002591225267689e+00_JPRBT,& +& p12 = 0.000001337767384067e+00_JPRBT,& +& p13 = -0.000000199542863674e+00_JPRBT) +n = nint(x - 2) +w = x - (n + 2) +y = ((((((((((((p13 * w + p12) * w + p11) * w + p10) *& +& w + p9) * w + p8) * w + p7) * w + p6) * w + p5) *& +& w + p4) * w + p3) * w + p2) * w + p1) * w + p0 +if (n .gt. 0) then + w = x - 1 + do k = 2, n + w = w * (x - k) + end do +else + w = 1 + do k = 0, -n - 1 + y = y * (x + k) + end do +end if +gamma_res = w / y +return +end function gamma + +END MODULE eq_regions_mod diff --git a/src/trans/gpu/internal/ext_acc.F90 b/src/trans/gpu/internal/ext_acc.F90 new file mode 100644 index 00000000..bf42d9a5 --- /dev/null +++ b/src/trans/gpu/internal/ext_acc.F90 @@ -0,0 +1,357 @@ +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +module openacc_ext_type + use iso_c_binding + implicit none + private + public :: ext_acc_arr_desc + + ! to my knowledge, this cannot be part of openacc_ext + type ext_acc_arr_desc + integer(c_size_t) :: ptr, sz + end type +end module +module openacc_ext + use iso_c_binding + use iso_fortran_env + use openacc, only : acc_create, acc_copyin, acc_handle_kind + use openacc_ext_type + implicit none + + private + public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, & + & ext_acc_delete, ext_acc_arr_desc, acc_handle_kind + + type common_pointer_descr + type(c_ptr) :: ptr + integer(c_size_t) :: sz + end type + + interface ext_acc_pass + module procedure ext_acc_pass_2d_r4, ext_acc_pass_3d_r4, ext_acc_pass_4d_r4, ext_acc_pass_2d_r8, ext_acc_pass_3d_r8, ext_acc_pass_4d_r8 + end interface +contains + + function ext_acc_pass_2d_r4(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 2) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) + ptr2 = c_loc(arr(lbound(arr,1)+1, lbound(arr,2))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 1) + end function + function ext_acc_pass_3d_r4(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 3) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1, lbound(arr,3))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 2) + end function + function ext_acc_pass_4d_r4(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:,:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 4) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3)+1, lbound(arr,4))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 3) + end function + function ext_acc_pass_2d_r8(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 2) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) + ptr2 = c_loc(arr(lbound(arr,1)+1, lbound(arr,2))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 1) + end function + function ext_acc_pass_3d_r8(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 3) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1, lbound(arr,3))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 2) + end function + function ext_acc_pass_4d_r8(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:,:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 4) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3)+1, lbound(arr,4))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 3) + end function + function get_common_pointers(in_ptrs, out_ptrs) result(num_ranges) + implicit none + type(ext_acc_arr_desc), intent(in) :: in_ptrs(:) + type(common_pointer_descr), intent(out) :: out_ptrs(:) + + integer(c_size_t), allocatable :: ptrs_only(:) + logical, allocatable :: mask(:) + integer, allocatable :: sort_index(:) + + type(ext_acc_arr_desc), allocatable :: common_ptrs(:) + integer :: i, j, num_ranges + integer(c_size_t) :: start1, start2, end1, end2 + logical :: found + + ! first sort the pointers increasingly such that no gaps are possible + allocate(ptrs_only(size(in_ptrs))) + do i = 1, size(in_ptrs) + ptrs_only(i) = in_ptrs(i)%ptr + enddo + allocate(mask(size(in_ptrs))) + do i = 1, size(in_ptrs) + mask(i) = .true. + enddo + allocate(sort_index(size(in_ptrs))) + do i = 1, size(in_ptrs) + j = minloc(ptrs_only, 1, mask=mask) + mask(j) = .false. + sort_index(i) = j + enddo + + ! initialize + allocate(common_ptrs(size(in_ptrs))) + do i = 1, size(in_ptrs) + common_ptrs(1)%ptr = 0 + common_ptrs(1)%sz = 0 + enddo + + num_ranges = 1 + common_ptrs(1) = in_ptrs(sort_index(1)) + do i = 2, size(in_ptrs) + found = .false. + start1 = in_ptrs(sort_index(i))%ptr + end1 = in_ptrs(sort_index(i))%ptr + in_ptrs(sort_index(i))%sz + do j = 1, num_ranges + start2 = common_ptrs(j)%ptr + end2 = common_ptrs(j)%ptr + common_ptrs(j)%sz + if (max(start1, start2) <= min(end1, end2)) then + ! if we intersect with this range, extend the range + common_ptrs(j)%ptr = min(start1, start2) + common_ptrs(j)%sz = max(end1, end2) - common_ptrs(j)%ptr + found = .true. + exit + endif + enddo + if (.not. found) then + ! if we did not find anything: add a new one + num_ranges = num_ranges + 1 + common_ptrs(num_ranges)%ptr = start1 + common_ptrs(num_ranges)%sz = end1 - start1 + endif + enddo + do i = 1, num_ranges + out_ptrs(i)%ptr = transfer(common_ptrs(i)%ptr, out_ptrs(i)%ptr) + out_ptrs(i)%sz = common_ptrs(i)%sz + enddo + end function + subroutine ext_acc_create(ptrs, stream) + use openacc, only : acc_create, acc_async_sync + implicit none + type(ext_acc_arr_desc), intent(in) :: ptrs(:) + integer(acc_handle_kind), optional :: stream + + type(common_pointer_descr), allocatable :: common_ptrs(:) + + integer :: i, num_ranges + integer(kind=int32), pointer :: pp(:) + integer(acc_handle_kind) :: stream_act + + if (present(stream)) then + stream_act = stream + else + stream_act = acc_async_sync + endif + allocate(common_ptrs(size(ptrs))) + num_ranges = get_common_pointers(ptrs, common_ptrs) + + do i = 1, num_ranges + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) + !!call acc_create_async(pp, common_ptrs(i)%sz, async=stream_act) + call acc_create(pp, int(common_ptrs(i)%sz)) + enddo + end subroutine + subroutine ext_acc_copyin(ptrs, stream) + use openacc + implicit none + type(ext_acc_arr_desc), intent(in) :: ptrs(:) + integer(acc_handle_kind), optional :: stream + + type(common_pointer_descr), allocatable :: common_ptrs(:) + + integer :: i, num_ranges + integer(4), pointer :: pp(:) + + integer(acc_handle_kind) :: stream_act + + if (present(stream)) then + stream_act = stream + else + stream_act = acc_async_sync + endif + allocate(common_ptrs(size(ptrs))) + num_ranges = get_common_pointers(ptrs, common_ptrs) + + do i = 1, num_ranges + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) + !!call acc_copyin_async(pp, common_ptrs(i)%sz, async=stream_act) + call acc_copyin(pp, int(common_ptrs(i)%sz)) + enddo + end subroutine + subroutine ext_acc_copyout(ptrs, stream) + use openacc + implicit none + type(ext_acc_arr_desc), intent(in) :: ptrs(:) + integer(acc_handle_kind), optional :: stream + + type(common_pointer_descr), allocatable :: common_ptrs(:) + + integer :: i, num_ranges + integer(4), pointer :: pp(:) + + integer(acc_handle_kind) :: stream_act + + if (present(stream)) then + stream_act = stream + else + stream_act = acc_async_sync + endif + allocate(common_ptrs(size(ptrs))) + num_ranges = get_common_pointers(ptrs, common_ptrs) + + do i = 1, num_ranges + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) + !!call acc_copyout_async(pp, common_ptrs(i)%sz, async=stream_act) + call acc_copyout(pp, int(common_ptrs(i)%sz)) + enddo + end subroutine + subroutine ext_acc_delete(ptrs, stream) + use openacc + implicit none + type(ext_acc_arr_desc), intent(in) :: ptrs(:) + integer(acc_handle_kind), optional :: stream + + type(common_pointer_descr), allocatable :: common_ptrs(:) + + integer :: i, num_ranges + integer(4), pointer :: pp(:) + + integer(acc_handle_kind) :: stream_act + + if (present(stream)) then + stream_act = stream + else + stream_act = acc_async_sync + endif + allocate(common_ptrs(size(ptrs))) + num_ranges = get_common_pointers(ptrs, common_ptrs) + + do i = 1, num_ranges + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) + !!call acc_delete_async(pp, common_ptrs(i)%sz, async=stream_act) + call acc_delete(pp, int(common_ptrs(i)%sz)) + enddo + end subroutine +end module diff --git a/src/trans/gpu/internal/field_split_mod.F90 b/src/trans/gpu/internal/field_split_mod.F90 new file mode 100755 index 00000000..20719e6c --- /dev/null +++ b/src/trans/gpu/internal/field_split_mod.F90 @@ -0,0 +1,140 @@ +! (C) Copyright 2001- ECMWF. +! (C) Copyright 2001- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE FIELD_SPLIT_MOD +CONTAINS +SUBROUTINE FIELD_SPLIT(KBLK,KF_GP,KKF_UV_G,KVSETUV,KVSETSC,& + & KSTUV_G,KENUV_G,KF_UV_G,KSTSC_G,KENSC_G,KF_SCALARS_G,& + & KSTUV,KENUV,KF_UV,KSTSC,KENSC,KF_SCALARS) + +!**** *FIELD_SPLIT* - Split fields + +! Purpose. +! -------- +! Split fields + +!** Interface. +! ---------- +! CALL FIELD_SPLIT(...) + +! Explicit arguments : +! -------------------- +! KBLK - block number +! KF_GP - total number of output gridpoint fields +! KKF_UV_G - global number of spectral u-v fields +! KVSETUV - IVSETUV from SHUFFLE +! KVSETSC - IVSETUV from SHUFFLE + +! All the following output arguments are quantities for THIS packet. +! KSTUV_G - +! KENUV_G - +! KF_UV_G - +! KSTSC_G - +! KENSC_G - +! KF_SCALARS_G - +! KSTUV - +! KENUV - +! KF_UV - +! KSTSC - +! KENSC - +! KF_SCALARS - + +! Externals. NONE +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 01-01-03 + +! ------------------------------------------------------------------ +USE PARKIND1 ,ONLY : JPIM + +USE TPM_GEN ,ONLY : NPROMATR +!USE TPM_TRANS +USE TPM_DISTR ,ONLY : MYSETV, NPRTRV +! + +IMPLICIT NONE + +! Declaration of arguments + +INTEGER(KIND=JPIM), INTENT(IN) :: KBLK,KF_GP,KKF_UV_G +INTEGER(KIND=JPIM), INTENT(IN) :: KVSETUV(:),KVSETSC(:) +INTEGER(KIND=JPIM), INTENT(OUT) :: KSTUV_G,KENUV_G,KF_UV_G,KSTSC_G,KENSC_G,KF_SCALARS_G +INTEGER(KIND=JPIM), INTENT(OUT) :: KSTUV,KENUV,KF_UV,KSTSC,KENSC,KF_SCALARS + +! Local variables + +INTEGER(KIND=JPIM) :: ISTF,IENF,J + +! ------------------------------------------------------------------ + +ISTF = (KBLK-1)*NPROMATR+1 +IENF = MIN(KBLK*NPROMATR,KF_GP) + +KSTUV_G = (KBLK-1)*NPROMATR/2+1 +KENUV_G = MIN(KBLK*NPROMATR/2,KKF_UV_G) +IF(ISTF > 2*KKF_UV_G) KSTUV_G = KENUV_G+1 +KF_UV_G = KENUV_G-KSTUV_G+1 +KSTSC_G = MAX(ISTF-2*KKF_UV_G,1) +KENSC_G = MAX(IENF-2*KKF_UV_G,0) +KF_SCALARS_G = KENSC_G-KSTSC_G+1 + +! Spectral fields distributed over fields + +IF(NPRTRV > 1) THEN + KF_UV = 0 + KSTUV = 1 + DO J=1,KSTUV_G-1 + IF(KVSETUV(J) == MYSETV) THEN + KSTUV = KSTUV+1 + ENDIF + ENDDO + KENUV = KSTUV-1 + DO J=KSTUV_G,KENUV_G + IF(KVSETUV(J) == MYSETV) THEN + KF_UV = KF_UV+1 + KENUV = KENUV+1 + ENDIF + ENDDO + KF_SCALARS = 0 + KSTSC = 1 + DO J=1,KSTSC_G-1 + IF(KVSETSC(J) == MYSETV) THEN + KSTSC =KSTSC+1 + ENDIF + ENDDO + KENSC = KSTSC-1 + DO J=KSTSC_G,KENSC_G + IF(KVSETSC(J) == MYSETV) THEN + KF_SCALARS = KF_SCALARS+1 + KENSC = KENSC+1 + ENDIF + ENDDO +ELSE + + ! Spectral fields not distributed over fields + + KF_UV = KF_UV_G + KSTUV = KSTUV_G + KENUV = KENUV_G + KF_SCALARS = KF_SCALARS_G + KSTSC = KSTSC_G + KENSC = KENSC_G +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE FIELD_SPLIT +END MODULE FIELD_SPLIT_MOD diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 new file mode 100755 index 00000000..6c0a2d16 --- /dev/null +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -0,0 +1,280 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE FSC_MOD + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: FSC, PREPARE_FSC, FSC_HANDLE + + TYPE FSC_HANDLE + END TYPE + +CONTAINS + FUNCTION PREPARE_FSC(ALLOCATOR) RESULT(HFSC) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + TYPE(FSC_HANDLE) :: HFSC + END FUNCTION +SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & + & KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) + +!**** *FSC - Division by a*cos(theta), east-west derivatives + +! Purpose. +! -------- +! In Fourier space divide u and v and all north-south +! derivatives by a*cos(theta). Also compute east-west derivatives +! of u,v,thermodynamic, passiv scalar variables and surface +! pressure. + +!** Interface. +! ---------- +! CALL FSC(..) +! Explicit arguments : KF_FS - total stride +! -------------------- KF_UV - # uv layers +! KF_SCALARS - # scalar layers +! *_OFFSET - offset of the respective layer +! +! Method. +! ------- + +! Externals. None. +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-03-03 (From SC2FSC) + +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +USE TPM_TRANS ,ONLY : LATLON +USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NUMP, D_NPTRLS, D_NSTAGTF +USE TPM_GEOMETRY ,ONLY : G_NMEN, G_NLOEN, G_NLOEN_MAX +USE TPM_FIELDS ,ONLY : F_RACTHE +USE TPM_GEN ,ONLY : NOUT +USE TPM_DIM ,ONLY : R_NSMAX +! + +IMPLICIT NONE +REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL_COMPLEX(:) +INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV, KF_SCALARS +INTEGER(KIND=JPIM), INTENT(IN) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(FSC_HANDLE), INTENT(IN) :: HFSC + +INTEGER(KIND=JPIM) :: KGL + +REAL(KIND=JPRBT) :: ZACHTE2 +REAL(KIND=JPRBT) :: ZAMP, ZPHASE +INTEGER(KIND=JPIM) :: IOFF_LAT,OFFSET_VAR +INTEGER(KIND=JPIM) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER +INTEGER(KIND=JPIM) :: JF,IGLG,II,JM +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX + + +! ------------------------------------------------------------------ + +IF(MYPROC > NPROC/2)THEN + IBEG=1 + IEND=D%NDGL_FS + IINC=1 +ELSE + IBEG=D%NDGL_FS + IEND=1 + IINC=-1 +ENDIF + +#ifdef ACCGPU +!$ACC DATA & +!$ACC& PRESENT(D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F_RACTHE,G_NMEN,G_NLOEN, G_NLOEN_MAX, R_NSMAX) +#endif +#ifdef OMPGPU +!$OMP TARGET DATA MAP(PRESENT,ALLOC:ZGTF) & +!$OMP& MAP(ALLOC:PUV,PSCALAR,PNSDERS,PEWDERS,PUVDERS) +#endif + +! ------------------------------------------------------------------ + +!* 1. DIVIDE U V AND N-S DERIVATIVES BY A*COS(THETA) +! ---------------------------------------------- + +OFFSET_VAR=D%NPTRLS(MYSETW) + +!* 1.1 U AND V. +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_UV,PUV,ZACHTE2) +#endif +#ifdef ACCGPU +!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_UV,ZACHTE2,JM,JF,KGL) & +!$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_OFFSET,KF_FS) ASYNC(1) +#endif +DO KGL=IBEG,IEND,IINC + DO JF=1,2*KF_UV + DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + IGLG = OFFSET_VAR+KGL-1 + IF (JM <= G_NMEN(IGLG)) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + ZACHTE2 = F_RACTHE(IGLG) + + PREEL_COMPLEX(IOFF_UV+2*JM+1) = & + & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2 + PREEL_COMPLEX(IOFF_UV+2*JM+2) = & + & PREEL_COMPLEX(IOFF_UV+2*JM+2)*ZACHTE2 + ENDIF + ENDDO + ENDDO +ENDDO + +!* 1.2 N-S DERIVATIVES + +IF (KSCALARS_NSDER_OFFSET >= 0) THEN +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_SCALARS,PNSDERS,ZACHTE2) +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_KSCALARS_NSDER,ZACHTE2,KGL,JF,JM) & + !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_SCALARS,KSCALARS_NSDER_OFFSET,KF_FS) ASYNC(1) +#endif + DO KGL=IBEG,IEND,IINC + DO JF=1,KF_SCALARS + DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + IGLG = OFFSET_VAR+KGL-1 + IF (JM <= G_NMEN(IGLG)) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_KSCALARS_NSDER = IOFF_LAT+(KSCALARS_NSDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + ZACHTE2 = F_RACTHE(IGLG) + + PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+1) = & + & PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+1)*ZACHTE2 + PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+2) = & + & PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+2)*ZACHTE2 + ENDIF + ENDDO + ENDDO + ENDDO +ENDIF + +! ------------------------------------------------------------------ + +!* 2. EAST-WEST DERIVATIVES +! --------------------- + +!* 2.1 U AND V. + +IF (KUV_EWDER_OFFSET >= 0) THEN +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_UV,PUVDERS,ZACHTE2,PUV) +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_UV,IOFF_UV_EWDER,RET_REAL,RET_COMPLEX,ZACHTE2,JM,JF,KGL) & + !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_EWDER_OFFSET,KUV_OFFSET,KF_FS) ASYNC(1) +#endif + DO KGL=IBEG,IEND,IINC + DO JF=1,2*KF_UV + DO JM=0,G_NLOEN_MAX/2 + IGLG = OFFSET_VAR+KGL-1 + ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have + ! to fill those floor(NLON/2)+1 values. + ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. + IF (JM <= G_NLOEN(IGLG)/2) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_UV_EWDER = IOFF_LAT+(KUV_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + RET_REAL = 0.0_JPRBT + RET_COMPLEX = 0.0_JPRBT + + IF (JM <= G_NMEN(IGLG)) THEN + ZACHTE2 = F_RACTHE(IGLG) + + RET_REAL = & + & -PREEL_COMPLEX(IOFF_UV+2*JM+2)*ZACHTE2*REAL(JM,JPRBT) + RET_COMPLEX = & + & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) + ENDIF + PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+1) = RET_REAL + PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+2) = RET_COMPLEX + ENDIF + ENDDO + ENDDO + ENDDO +ENDIF + +!* 2.2 SCALAR VARIABLES + +IF (KSCALARS_EWDER_OFFSET > 0) THEN +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_SCALARS,PEWDERS,ZACHTE2,PSCALAR) +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,RET_REAL,RET_COMPLEX) & + !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,KF_SCALARS,OFFSET_VAR,KSCALARS_EWDER_OFFSET,KSCALARS_OFFSET,KF_FS) ASYNC(1) +#endif + DO KGL=IBEG,IEND,IINC + DO JF=1,KF_SCALARS + DO JM=0,G_NLOEN_MAX/2 + IGLG = OFFSET_VAR+KGL-1 + ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have + ! to fill those floor(NLON/2)+1 values. + ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. + IF (JM <= G_NLOEN(IGLG)/2) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_SCALARS_EWDER = IOFF_LAT+(KSCALARS_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_SCALARS = IOFF_LAT+(KSCALARS_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + RET_REAL = 0.0_JPRBT + RET_COMPLEX = 0.0_JPRBT + + IF (JM <= G_NMEN(IGLG)) THEN + ZACHTE2 = F_RACTHE(IGLG) + + RET_REAL = & + & -PREEL_COMPLEX(IOFF_SCALARS+2*JM+2)*ZACHTE2*REAL(JM,JPRBT) + RET_COMPLEX = & + & PREEL_COMPLEX(IOFF_SCALARS+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) + ENDIF + ! The rest from G_NMEN(IGLG+1)...MAX is zero truncated + PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+1) = RET_REAL + PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+2) = RET_COMPLEX + ENDIF + ENDDO + ENDDO + ENDDO +ENDIF + +#ifdef ACCGPU +!$ACC WAIT(1) + +!$ACC END DATA +#endif +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif +! ------------------------------------------------------------------ + +END SUBROUTINE FSC +END MODULE FSC_MOD diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 new file mode 100755 index 00000000..b131947c --- /dev/null +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -0,0 +1,118 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE FTDIR_MOD + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: FTDIR, FTDIR_HANDLE, PREPARE_FTDIR + + TYPE FTDIR_HANDLE + END TYPE +CONTAINS + + FUNCTION PREPARE_FTDIR() RESULT(HFTDIR) + IMPLICIT NONE + TYPE(FTDIR_HANDLE) :: HFTDIR + END FUNCTION + + SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) + !**** *FTDIR - Direct Fourier transform + + ! Purpose. Routine for Grid-point to Fourier transform + ! -------- + + !** Interface. + ! ---------- + ! CALL FTDIR(..) + + ! Explicit arguments : PREEL - Fourier/grid-point array + ! -------------------- KFIELD - number of fields + + ! Method. + ! ------- + + ! Externals. FFT992 - FFT routine + ! ---------- + ! + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 00-03-03 + ! G. Radnoti 01-04-24 2D model (NLOEN=1) + ! D. Degrauwe (Feb 2012): Alternative extension zone (E') + ! G. Mozdzynski (Oct 2014): support for FFTW transforms + ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW + ! ------------------------------------------------------------------ + + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT + + USE TPM_DISTR ,ONLY : MYSETW, MYPROC, NPROC, D_NSTAGT0B, D_NSTAGTF,D_NPTRLS, D_NPNTGTB0, D_NPROCM, D_NDGL_FS + USE TPM_GEOMETRY ,ONLY : G_NMEN, G_NLOEN + USE TPM_HICFFT ,ONLY : EXECUTE_DIR_FFT + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + IMPLICIT NONE + + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) + REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(FTDIR_HANDLE) :: HFTDIR + + INTEGER(KIND=JPIM) :: KGL + + PREEL_COMPLEX => PREEL_REAL + +#ifdef ACCGPU + !$ACC DATA PRESENT(PREEL_REAL, PREEL_COMPLEX, & + !$ACC& D_NSTAGTF,D_NSTAGT0B,D_NPTRLS,D_NPROCM,D_NPNTGTB0,G_NMEN,G_NLOEN) +#endif +#ifdef OMPGPU + !$OMP TARGET DATA MAP(ALLOC:PREEL_COMPLEX) +#endif + + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(413,0) + CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & + & LOENS=G_NLOEN(D_NPTRLS(MYSETW):D_NPTRLS(MYSETW)+D_NDGL_FS-1), & + & OFFSETS=D_NSTAGTF(1:D_NDGL_FS+1),ALLOC=ALLOCATOR%PTR) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(433,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(433,1) + ENDIF + CALL GSTATS(413,1) + +#ifdef ACCGPU + !$ACC END DATA +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif + + NULLIFY(PREEL_REAL) + + ! ------------------------------------------------------------------ + END SUBROUTINE FTDIR +END MODULE FTDIR_MOD diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 new file mode 100755 index 00000000..77a8cf22 --- /dev/null +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -0,0 +1,118 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE FTINV_MOD + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: FTINV, FTINV_HANDLE, PREPARE_FTINV + + TYPE FTINV_HANDLE + END TYPE +CONTAINS + FUNCTION PREPARE_FTINV(ALLOCATOR) RESULT(HFTINV) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + TYPE(FTINV_HANDLE) :: HFTINV + END FUNCTION + + SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) + !**** *FTINV - Inverse Fourier transform + + ! Purpose. Routine for Fourier to Grid-point transform + ! -------- + + !** Interface. + ! ---------- + ! CALL FTINV(..) + + ! Explicit arguments : PREEL - Fourier/grid-point array + ! -------------------- KFIELD - number of fields + + ! Method. + ! ------- + + ! Externals. FFT992 - FFT routine + ! ---------- + ! + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 00-03-03 + ! G. Radnoti 01-04-24 2D model (NLOEN=1) + ! D. Degrauwe (Feb 2012): Alternative extension zone (E') + ! G. Mozdzynski (Oct 2014): support for FFTW transforms + ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW + ! ------------------------------------------------------------------ + + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT + + USE TPM_DISTR ,ONLY : MYSETW, MYPROC, NPROC, D_NPTRLS, D_NDGL_FS, D_NSTAGTF + USE TPM_GEOMETRY ,ONLY : G_NLOEN + USE TPM_HICFFT ,ONLY : EXECUTE_INV_FFT + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + IMPLICIT NONE + + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) + REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(FTINV_HANDLE), INTENT(IN) :: HFTINV + + INTEGER(KIND=JPIM) :: KGL + + PREEL_REAL => PREEL_COMPLEX + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,D_NPTRLS,D_NDGL_FS,D_NSTAGTF,G_NLOEN) +#endif + + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(423,0) + CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & + & LOENS=G_NLOEN(D_NPTRLS(MYSETW):D_NPTRLS(MYSETW)+D_NDGL_FS-1), & + & OFFSETS=D_NSTAGTF(1:D_NDGL_FS),ALLOC=ALLOCATOR%PTR) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(443,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(443,1) + ENDIF + CALL GSTATS(423,1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA +#endif + + NULLIFY(PREEL_COMPLEX) + + ! ------------------------------------------------------------------ + END SUBROUTINE FTINV +END MODULE FTINV_MOD diff --git a/src/trans/gpu/internal/gath_grid_32_ctl_mod.F90 b/src/trans/gpu/internal/gath_grid_32_ctl_mod.F90 new file mode 100755 index 00000000..9af5b018 --- /dev/null +++ b/src/trans/gpu/internal/gath_grid_32_ctl_mod.F90 @@ -0,0 +1,277 @@ +! (C) Copyright 2000- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE GATH_GRID_32_CTL_MOD +CONTAINS +SUBROUTINE GATH_GRID_32_CTL(PGPG,KFGATHG,KPROMA,KTO,PGP) + +!**** *GATH_GRID_32_CTL* - Gather global gridpoint array from processors + +! Purpose. +! -------- +! Routine for gathering gridpoint array + +!** Interface. +! ---------- +! CALL GATH_GRID_32_CTL(...) + +! Explicit arguments : +! -------------------- +! PGPG(:,:) - Global gridpoint array +! KFGATHG - Global number of fields to be gathered +! KPROMA - blocking factor for gridpoint input +! KTO(:) - Processor responsible for gathering each field +! PGP(:,:,:) - Local spectral array +! +! ------------------------------------------------------------------ + + +USE PARKIND1 ,ONLY : JPIM ,JPRM +USE MPL_MODULE + +USE TPM_GEN +USE TPM_DIM +USE TPM_GEOMETRY +USE TPM_DISTR + +USE SET2PE_MOD +USE EQ_REGIONS_MOD + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRM) ,OPTIONAL, INTENT(OUT) :: PGPG(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFGATHG +INTEGER(KIND=JPIM) , INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) , INTENT(IN) :: KTO(:) +REAL(KIND=JPRM) , INTENT(IN) :: PGP(:,:,:) + +! Declaration of local variables + +REAL(KIND=JPRM) :: ZFLD(D%NGPTOTMX*KFGATHG),ZDUM(D%NGPTOTMX) +REAL(KIND=JPRM),ALLOCATABLE :: ZBUF(:) +INTEGER(KIND=JPIM) :: IFLDR,JFLD,ITAG,ILEN,JA,JB,ISND,JGL,JLON,ILOFF,ILENB,IST +INTEGER(KIND=JPIM) :: IRCV,IOFF,ILAST,IGL1,IGL2,IGLOFF +INTEGER(KIND=JPIM) :: JKGLO,JROF,IEND,J,IBL,IPROC,JROC,IMYFIELDS,ILRECV +INTEGER(KIND=JPIM) :: ISENDREQ(NPROC),IOUNT,ITO +INTEGER(KIND=JPIM) :: ILENS(NPROC),IOFFS(NPROC),ILENR(NPROC),IOFFR(NPROC) +INTEGER(KIND=JPIM) :: IFLDL,IFLDS +LOGICAL :: LLSAME +! ------------------------------------------------------------------ + + +!GATHER SPECTRAL ARRAY + +IF( NPROC == 1 ) THEN + CALL GSTATS(1643,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF) + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JFLD=1,KFGATHG + DO JROF=1,IEND + PGPG(IOFF+JROF,JFLD) = PGP(JROF,JFLD,IBL) + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + CALL GSTATS(1643,1) + +ELSE +! test if values in KTO are all the same + LLSAME=.TRUE. + ITO=KTO(1) + DO JFLD=2,KFGATHG + IF(KTO(JFLD) /= ITO) THEN + LLSAME=.FALSE. + EXIT + ENDIF + ENDDO + IFLDL=D%NGPTOTMX + IF(LLSAME) THEN + CALL GSTATS(1643,0) + !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF) + DO JFLD=1,KFGATHG + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JROF=1,IEND + ZFLD(IOFF+JROF+(JFLD-1)*IFLDL) = PGP(JROF,JFLD,IBL) + ENDDO + ENDDO + ENDDO + !$OMP END PARALLEL DO + CALL GSTATS(1643,1) + ELSE + ILENS(:)=0 + IOFFS(:)=0 + ILENR(:)=0 + IOFFR(:)=0 + DO JFLD=1,KFGATHG + ILENS(KTO(JFLD))=ILENS(KTO(JFLD))+IFLDL + IF(KTO(JFLD) == MYPROC) THEN + ILENR(:)=ILENR(:)+IFLDL + ENDIF + ENDDO + DO JROC=2,NPROC + IOFFR(JROC)=IOFFR(JROC-1)+ ILENR(JROC-1) + IOFFS(JROC)=IOFFS(JROC-1)+ ILENS(JROC-1) + ENDDO + IFLDS=0 + DO JROC=1,NPROC + IF(ILENS(JROC) > 0) THEN + DO JFLD=1,KFGATHG + IF(KTO(JFLD) == JROC) THEN + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JROF=1,IEND + ZFLD(IOFF+JROF+IFLDS*IFLDL) = PGP(JROF,JFLD,IBL) + ENDDO + ENDDO + IFLDS=IFLDS+1 + ENDIF + ENDDO + ENDIF + ENDDO + ENDIF + + IMYFIELDS = 0 + DO JFLD=1,KFGATHG + IF(KTO(JFLD) == MYPROC) THEN + IMYFIELDS = IMYFIELDS+1 + ENDIF + ENDDO + + IF(IMYFIELDS > 0) THEN + ALLOCATE(ZBUF(D%NGPTOTMX*IMYFIELDS*NPROC)) + ELSE + ALLOCATE(ZBUF(1)) + ENDIF + IFLDR = 0 + CALL GSTATS_BARRIER(789) + CALL GSTATS(809,0) + + IF( LLSAME )THEN + !Send + ISND = KTO(1) + ITAG = MTAGDISTSP+1+17 + CALL MPL_SEND(ZFLD,KDEST=NPRCIDS(ISND),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(1),& + &CDSTRING='GATH_GRID_32_CTL:') + + ! RECIEVE + IF(KTO(1) == MYPROC) THEN + IFLDR = KFGATHG + DO JROC=1,NPROC + ITAG = MTAGDISTSP+1+17 + IRCV = JROC + IOFF=IFLDL*KFGATHG*(JROC-1) + CALL MPL_RECV(ZBUF(IOFF+1:IOFF+IFLDL*KFGATHG),KSOURCE=NPRCIDS(IRCV),& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILRECV,& + &KTAG=ITAG,CDSTRING='GATH_GRID_32_CTL:') + ENDDO + ENDIF + CALL MPL_WAIT(KREQUEST=ISENDREQ(1), & + & CDSTRING='GATH_GRID_32_CTL: WAIT') + ELSE + IFLDR=IMYFIELDS + CALL MPL_ALLTOALLV(PSENDBUF=ZFLD,KSENDCOUNTS=ILENS,& + & PRECVBUF=ZBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,& + & CDSTRING='GATH_GRID_32_CTL:') +!!$ ITAG = MTAGDISTSP+1+17 +!!$ DO JROC=1,NPROC +!!$ ISND=JROC +!!$ IOFF=IOFFS(JROC) +!!$ ILEN=ILENS(JROC) +!!$ IF(ILEN > 0 ) THEN +!!$ CALL MPL_SEND(ZFLD(IOFF+1:IOFF+ILEN),KDEST=NPRCIDS(ISND),KTAG=ITAG,& +!!$ &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(ISND),& +!!$ &CDSTRING='GATH_GRID_32_CTL:') +!!$ ENDIF +!!$ ENDDO +!!$ DO JROC=1,NPROC +!!$ IRCV = JROC +!!$ IOFF = IOFFR(JROC) +!!$ ILEN = ILENR(JROC) +!!$ IF(ILEN > 0 ) THEN +!!$ CALL MPL_RECV(ZBUF(IOFF+1:IOFF+ILEN),KSOURCE=NPRCIDS(IRCV),& +!!$ &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILRECV,& +!!$ &KTAG=ITAG,CDSTRING='GATH_GRID_32_CTL:') +!!$ ENDIF +!!$ ENDDO +!!$ DO JROC=1,NPROC +!!$ ISND=JROC +!!$ ILEN=ILENS(JROC) +!!$ IF(ILEN > 0 ) THEN +!!$ CALL MPL_WAIT(KREQUEST=ISENDREQ(JROC), & +!!$ & CDSTRING='GATH_GRID_32_CTL: WAIT') +!!$ ENDIF +!!$ ENDDO + ENDIF + + CALL GSTATS(809,1) + CALL GSTATS_BARRIER2(789) + CALL GSTATS(1643,0) +!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1)& +!$OMP&PRIVATE(JA,JB,IPROC,IGLOFF,IGL1,IGL2,IOFF,ILAST,J,& +!$OMP&ILEN,ILOFF,JGL,JLON,JFLD) + DO JFLD=1,IFLDR + DO JA=1,N_REGIONS_NS + DO JB=1,N_REGIONS(JA) + CALL SET2PE(IPROC,JA,JB,0,0) + IGLOFF = D%NPTRFRSTLAT(JA) + IGL1 = D%NFRSTLAT(JA) + IGL2 = D%NLSTLAT(JA) + IOFF = 0 + IF(JA > 1) THEN + IF( D%NLSTLAT(JA-1) == D%NFRSTLAT(JA) )THEN + ILAST = D%NLSTLAT(JA-1)-1 + ELSE + ILAST = D%NLSTLAT(JA-1) + ENDIF + DO J=D%NFRSTLAT(1),ILAST + IOFF = IOFF+G%NLOEN(J) + ENDDO + ENDIF + + ILEN = 0 + ILOFF = 0 + DO JGL=IGL1,IGL2 + DO JLON=1,D%NONL(IGLOFF+JGL-IGL1,JB) + PGPG(IOFF+ILOFF+D%NSTA(IGLOFF+JGL-IGL1,JB)+JLON-1,JFLD) = & + & ZBUF(ILEN+JLON+(JFLD-1)*IFLDL+(IPROC-1)*IFLDL*IMYFIELDS) + ENDDO + ILEN = ILEN + D%NONL(IGLOFF+JGL-IGL1,JB) + ILOFF = ILOFF + G%NLOEN(JGL) + ENDDO + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + + CALL GSTATS(1643,1) +! Synhronize processors +! Should not be necessary +!!$ CALL GSTATS(784,0) +!!$ CALL MPL_BARRIER(CDSTRING='GATH_GRID_32_CTL:') +!!$ CALL GSTATS(784,1) + IF(ALLOCATED(ZBUF)) DEALLOCATE(ZBUF) +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE GATH_GRID_32_CTL +END MODULE GATH_GRID_32_CTL_MOD + + diff --git a/src/trans/gpu/internal/gath_grid_ctl_mod.F90 b/src/trans/gpu/internal/gath_grid_ctl_mod.F90 new file mode 100755 index 00000000..c8f5a3d4 --- /dev/null +++ b/src/trans/gpu/internal/gath_grid_ctl_mod.F90 @@ -0,0 +1,290 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE GATH_GRID_CTL_MOD +CONTAINS +SUBROUTINE GATH_GRID_CTL(PGPG,KFGATHG,KPROMA,KTO,PGP) + +!**** *GATH_GRID_CTL* - Gather global gridpoint array from processors + +! Purpose. +! -------- +! Routine for gathering gridpoint array + +!** Interface. +! ---------- +! CALL GATH_GRID_CTL(...) + +! Explicit arguments : +! -------------------- +! PGPG(:,:) - Global gridpoint array +! KFGATHG - Global number of fields to be gathered +! KPROMA - blocking factor for gridpoint input +! KTO(:) - Processor responsible for gathering each field +! PGP(:,:,:) - Local gridpoint array +! +! ------------------------------------------------------------------ + + +USE PARKIND1 ,ONLY : JPIM ,JPRB +USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_RECV, MPL_SEND, MPL_WAIT, & + & JP_BLOCKING_STANDARD, JP_NON_BLOCKING_STANDARD + +!USE TPM_GEN +!USE TPM_DIM +USE TPM_GEOMETRY ,ONLY : G +USE TPM_DISTR ,ONLY : D, MTAGDISTSP, NPRCIDS, MYPROC, NPROC + +USE SET2PE_MOD ,ONLY : SET2PE +USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_NS + +IMPLICIT NONE + +! Declaration of arguments + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PGPG(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFGATHG +INTEGER(KIND=JPIM) , INTENT(IN) :: KPROMA +INTEGER(KIND=JPIM) , INTENT(IN) :: KTO(:) +REAL(KIND=JPRB) , INTENT(IN) :: PGP(:,:,:) + +! Declaration of local variables + +REAL(KIND=JPRB) :: ZFLD(D%NGPTOTMX*KFGATHG),ZDUM(D%NGPTOTMX) +REAL(KIND=JPRB),ALLOCATABLE :: ZBUF(:) +INTEGER(KIND=JPIM),ALLOCATABLE :: IREQ(:) +INTEGER(KIND=JPIM) :: IFLDR,JFLD,ITAG,ILEN,JA,JB,ISND,JGL,JLON,ILOFF,ILENB +INTEGER(KIND=JPIM) :: IRCV,IOFF,ILAST,IGL1,IGL2,IGLOFF,IR +INTEGER(KIND=JPIM) :: JKGLO,JROF,IEND,J,IBL,IPROC,JROC,IMYFIELDS,ILRECV +INTEGER(KIND=JPIM) :: ISENDREQ(KFGATHG),ITO +INTEGER(KIND=JPIM) :: ILENS(NPROC),IOFFS(NPROC),ILENR(NPROC),IOFFR(NPROC) +INTEGER(KIND=JPIM) :: IFLDL,IFLDS +LOGICAL :: LLSAME +! ------------------------------------------------------------------ + + +!GATHER SPECTRAL ARRAY + +IF( NPROC == 1 ) THEN + CALL GSTATS(1643,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF) + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JFLD=1,KFGATHG + DO JROF=1,IEND + PGPG(IOFF+JROF,JFLD) = PGP(JROF,JFLD,IBL) + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + CALL GSTATS(1643,1) + +ELSE +! test if values in KTO are all the same + LLSAME=.TRUE. + ITO=KTO(1) + DO JFLD=2,KFGATHG + IF(KTO(JFLD) /= ITO) THEN + LLSAME=.FALSE. + EXIT + ENDIF + ENDDO + + IFLDL=D%NGPTOTMX + IF(LLSAME) THEN + CALL GSTATS(1643,0) + !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF) + DO JFLD=1,KFGATHG + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JROF=1,IEND + ZFLD(IOFF+JROF+(JFLD-1)*IFLDL) = PGP(JROF,JFLD,IBL) + ENDDO + ENDDO + ENDDO + !$OMP END PARALLEL DO + CALL GSTATS(1643,1) + ELSE + ILENS(:)=0 + IOFFS(:)=0 + ILENR(:)=0 + IOFFR(:)=0 + DO JFLD=1,KFGATHG + ILENS(KTO(JFLD))=ILENS(KTO(JFLD))+IFLDL + IF(KTO(JFLD) == MYPROC) THEN + ILENR(:)=ILENR(:)+IFLDL + ENDIF + ENDDO + DO JROC=2,NPROC + IOFFR(JROC)=IOFFR(JROC-1)+ ILENR(JROC-1) + IOFFS(JROC)=IOFFS(JROC-1)+ ILENS(JROC-1) + ENDDO + IFLDS=0 + DO JROC=1,NPROC + IF(ILENS(JROC) > 0) THEN + DO JFLD=1,KFGATHG + IF(KTO(JFLD) == JROC) THEN + DO JKGLO=1,D%NGPTOT,KPROMA + IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1) + IOFF = JKGLO-1 + IBL = (JKGLO-1)/KPROMA+1 + DO JROF=1,IEND + ZFLD(IOFF+JROF+IFLDS*IFLDL) = PGP(JROF,JFLD,IBL) + ENDDO + ENDDO + IFLDS=IFLDS+1 + ENDIF + ENDDO + ENDIF + ENDDO + ENDIF + + IMYFIELDS = 0 + DO JFLD=1,KFGATHG + IF(KTO(JFLD) == MYPROC) THEN + IMYFIELDS = IMYFIELDS+1 + ENDIF + ENDDO + + IF(IMYFIELDS > 0) THEN + ALLOCATE(ZBUF(D%NGPTOTMX*IMYFIELDS*NPROC)) + ELSE + ALLOCATE(ZBUF(1)) + ENDIF + IFLDR = 0 + CALL GSTATS_BARRIER(789) + CALL GSTATS(809,0) + + IF( LLSAME )THEN + !Send + ISND = KTO(1) + ITAG = MTAGDISTSP+1+17 + CALL MPL_SEND(ZFLD,KDEST=NPRCIDS(ISND),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(1),& + &CDSTRING='GATH_GRID_CTL:') + ! RECIEVE + IF(KTO(1) == MYPROC) THEN + IFLDR = KFGATHG + DO JROC=1,NPROC + ITAG = MTAGDISTSP+1+17 + IRCV = JROC + IOFF=IFLDL*KFGATHG*(JROC-1) + CALL MPL_RECV(ZBUF(IOFF+1:IOFF+IFLDL*KFGATHG),KSOURCE=NPRCIDS(IRCV),& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILRECV,& + &KTAG=ITAG,CDSTRING='GATH_GRID_CTL:') + ENDDO + ENDIF + CALL MPL_WAIT(KREQUEST=ISENDREQ(1), & + & CDSTRING='GATH_GRID_CTL: WAIT') + ELSE + IFLDR=IMYFIELDS + +! ALLTOALLV performance is really slow when number of fields (KFGATHG) is << NPROC +! This was for IBM - and RECV/SEND alternative causes problems for large number of MPI tasks. + +! IF( KFGATHG >= NPROC/8 )THEN + IF( .TRUE. )THEN + CALL MPL_ALLTOALLV(PSENDBUF=ZFLD,KSENDCOUNTS=ILENS,& + & PRECVBUF=ZBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,& + & CDSTRING='GATH_GRID_CTL:') + ELSE + IR=0 + DO JFLD=1,KFGATHG + IF(KTO(JFLD) == MYPROC) THEN + IR=IR+NPROC + ENDIF + ENDDO + IR=IR+KFGATHG + ALLOCATE(IREQ(IR)) + IR=0 + ITAG = MTAGDISTSP+1+17 + DO JROC=1,NPROC + DO JFLD=1,KFGATHG + IF(KTO(JFLD) == MYPROC) THEN + IRCV = JROC + IR=IR+1 + CALL MPL_RECV(ZBUF(1+IOFFR(IRCV):IOFFR(IRCV)+ILENR(IRCV)),KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ(IR),& + &CDSTRING='GATH_GRID_CTL:') + ENDIF + ENDDO + ENDDO + DO JFLD=1,KFGATHG + ISND = KTO(JFLD) + IR=IR+1 + CALL MPL_SEND(ZFLD(1+IOFFS(ISND):IOFFS(ISND)+ILENS(ISND)),KDEST=NPRCIDS(ISND),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ(IR),& + &CDSTRING='GATH_GRID_CTL:') + ENDDO + CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & + & CDSTRING='GATH_GRID_CTL: WAIT') + DEALLOCATE(IREQ) + ENDIF + ENDIF + + CALL GSTATS(809,1) + CALL GSTATS_BARRIER2(789) + CALL GSTATS(1643,0) +!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1)& +!$OMP&PRIVATE(JA,JB,IPROC,IGLOFF,IGL1,IGL2,IOFF,ILAST,J,& +!$OMP&ILEN,ILOFF,JGL,JLON,JFLD) + DO JFLD=1,IFLDR + DO JA=1,N_REGIONS_NS + DO JB=1,N_REGIONS(JA) + CALL SET2PE(IPROC,JA,JB,0,0) + IGLOFF = D%NPTRFRSTLAT(JA) + IGL1 = D%NFRSTLAT(JA) + IGL2 = D%NLSTLAT(JA) + IOFF = 0 + IF(JA > 1) THEN + IF( D%NLSTLAT(JA-1) == D%NFRSTLAT(JA) )THEN + ILAST = D%NLSTLAT(JA-1)-1 + ELSE + ILAST = D%NLSTLAT(JA-1) + ENDIF + DO J=D%NFRSTLAT(1),ILAST + IOFF = IOFF+G%NLOEN(J) + ENDDO + ENDIF + + ILEN = 0 + ILOFF = 0 + DO JGL=IGL1,IGL2 + DO JLON=1,D%NONL(IGLOFF+JGL-IGL1,JB) + PGPG(IOFF+ILOFF+D%NSTA(IGLOFF+JGL-IGL1,JB)+JLON-1,JFLD) = & + & ZBUF(ILEN+JLON+(JFLD-1)*IFLDL+(IPROC-1)*IFLDL*IMYFIELDS) + ENDDO + ILEN = ILEN + D%NONL(IGLOFF+JGL-IGL1,JB) + ILOFF = ILOFF + G%NLOEN(JGL) + ENDDO + ENDDO + ENDDO + ENDDO +!$OMP END PARALLEL DO + + CALL GSTATS(1643,1) +! Synhronize processors +! Should not be necessary +!!$ CALL GSTATS(784,0) +!!$ CALL MPL_BARRIER(CDSTRING='GATH_GRID_CTL:') +!!$ CALL GSTATS(784,1) + IF(ALLOCATED(ZBUF)) DEALLOCATE(ZBUF) +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE GATH_GRID_CTL +END MODULE GATH_GRID_CTL_MOD + + diff --git a/src/trans/gpu/internal/gath_spec_control_mod.F90 b/src/trans/gpu/internal/gath_spec_control_mod.F90 new file mode 100755 index 00000000..88f7d213 --- /dev/null +++ b/src/trans/gpu/internal/gath_spec_control_mod.F90 @@ -0,0 +1,233 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE GATH_SPEC_CONTROL_MOD +CONTAINS +SUBROUTINE GATH_SPEC_CONTROL(PSPECG,KFGATHG,KTO,KVSET,PSPEC,LDIM1_IS_FLD,& + & KSMAX,KSPEC2,KSPEC2_G,KPOSSP,KDIM0G,LDZA0IP) + +!**** *GATH_SPEC_CONTROL* - Gather global spectral array from processors + +! Purpose. +! -------- +! Routine for gathering spectral array + +!** Interface. +! ---------- +! CALL GATH_SPEC_CONTROL(...) + +! Explicit arguments : +! -------------------- +! PSPECG(:,:) - Global spectral array +! KFGATHG - Global number of fields to be distributed +! KTO(:) - Processor responsible for distributing each field +! KVSET(:) - "B-Set" for each field +! PSPEC(:,:) - Local spectral array +! LDZA0IP - Set first coefficients (imaginary part) to zero + +! ------------------------------------------------------------------ + + +USE PARKIND1 ,ONLY : JPIM ,JPRB +USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_BARRIER, MPL_WAIT, & + & JP_BLOCKING_STANDARD, JP_NON_BLOCKING_STANDARD + +!USE TPM_GEN +!USE TPM_DIM +USE TPM_DISTR ,ONLY : MTAGDISTSP, NPRCIDS, NPRTRW, & + & MYSETV, MYSETW, MYPROC, NPROC +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + +USE SET2PE_MOD ,ONLY : SET2PE +!USE SUWAVEDI_MOD +! + +IMPLICIT NONE + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPECG(:,:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFGATHG +INTEGER(KIND=JPIM) , INTENT(IN) :: KTO(:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KVSET(:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPEC(:,:) +LOGICAL ,OPTIONAL, INTENT(IN) :: LDIM1_IS_FLD +INTEGER(KIND=JPIM) , INTENT(IN) :: KSMAX +INTEGER(KIND=JPIM) , INTENT(IN) :: KSPEC2 +INTEGER(KIND=JPIM) , INTENT(IN) :: KSPEC2_G +INTEGER(KIND=JPIM) , INTENT(IN) :: KPOSSP(:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KDIM0G(0:) +LOGICAL ,OPTIONAL, INTENT(IN) :: LDZA0IP + +REAL(KIND=JPRB) :: ZFLD(KSPEC2,KFGATHG),ZDUM(KSPEC2) +REAL(KIND=JPRB),ALLOCATABLE :: ZRECV(:,:) +INTEGER(KIND=JPIM) :: JM,JN,II,IFLDR,IFLDS,JFLD,ITAG,IBSET,ILEN,JA,ISND +INTEGER(KIND=JPIM) :: IRCV,ISP,ILENR,ISTA,ISTP,ISENDREQ(KFGATHG),IPOS0,JNM +INTEGER(KIND=JPIM) :: IDIST(KSPEC2_G),IMYFIELDS +LOGICAL :: LLZA0IP + +! ------------------------------------------------------------------ + +LLZA0IP=.TRUE. +IF (PRESENT (LDZA0IP)) LLZA0IP=LDZA0IP + +!GATHER SPECTRAL ARRAY + +IF( NPROC == 1 ) THEN + CALL GSTATS(1644,0) + IF(LDIM1_IS_FLD) THEN +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JM,JFLD) + DO JM=1,KSPEC2_G + DO JFLD=1,KFGATHG + PSPECG(JFLD,JM) =PSPEC(JFLD,JM) + ENDDO + ENDDO +!$OMP END PARALLEL DO + ELSE +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JM,JFLD) + DO JFLD=1,KFGATHG + DO JM=1,KSPEC2_G + PSPECG(JM,JFLD) =PSPEC(JM,JFLD) + ENDDO + ENDDO +!$OMP END PARALLEL DO + ENDIF + CALL GSTATS(1644,1) +ELSE + IMYFIELDS = 0 + DO JFLD=1,KFGATHG + IF(KTO(JFLD) == MYPROC) THEN + IMYFIELDS = IMYFIELDS+1 + ENDIF + ENDDO + IF(IMYFIELDS>0) THEN + ALLOCATE(ZRECV(KSPEC2_G,IMYFIELDS)) + II = 0 + CALL GSTATS(1804,0) + DO JM=0,KSMAX + DO JN=JM,KSMAX + IDIST(II+1) = KDIM0G(JM)+(JN-JM)*2 + IDIST(II+2) = KDIM0G(JM)+(JN-JM)*2+1 + II = II+2 + ENDDO + ENDDO + CALL GSTATS(1804,1) + ENDIF + + CALL GSTATS_BARRIER(788) + + !Send + CALL GSTATS(810,0) + IFLDS = 0 + IF(KSPEC2 > 0 )THEN + DO JFLD=1,KFGATHG + + IBSET = KVSET(JFLD) + IF( IBSET == MYSETV )THEN + + IFLDS = IFLDS+1 + ISND = KTO(JFLD) + ITAG = MTAGDISTSP+JFLD+17 + IF(LDIM1_IS_FLD) THEN + ZFLD(1:KSPEC2,IFLDS)=PSPEC(IFLDS,1:KSPEC2) + CALL MPL_SEND(ZFLD(1:KSPEC2,IFLDS),KDEST=NPRCIDS(ISND),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(JFLD),& + &CDSTRING='GATH_SPEC_CONTROL') + ELSE + CALL MPL_SEND(PSPEC(1:KSPEC2,IFLDS),KDEST=NPRCIDS(ISND),KTAG=ITAG,& + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(JFLD),& + &CDSTRING='GATH_SPEC_CONTROL') + ENDIF + ENDIF + ENDDO + ENDIF + + ! Recieve + IFLDR = 0 + DO JFLD=1,KFGATHG + IF(KTO(JFLD) == MYPROC) THEN + IBSET = KVSET(JFLD) + IFLDR = IFLDR+1 + DO JA=1,NPRTRW + ILEN = KPOSSP(JA+1)-KPOSSP(JA) + IF( ILEN > 0 )THEN + CALL SET2PE(IRCV,0,0,JA,IBSET) + ITAG = MTAGDISTSP+JFLD+17 + ISTA = KPOSSP(JA) + ISTP = ISTA+ILEN-1 + CALL MPL_RECV(ZRECV(ISTA:ISTP,IFLDR),KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,& + &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR, & + &CDSTRING='GATH_SPEC_CONTROL') + IF( ILENR /= ILEN )THEN + WRITE(0,'("GATH_SPEC_CONTROL: JFLD=",I4," JA=",I4," ILEN=",I10," ILENR=",I10)')& + &JFLD,JA,ILEN,ILENR + CALL ABORT_TRANS('GATH_SPEC_CONTROL:INVALID RECEIVE MESSAGE LENGTH') + ENDIF + ENDIF + ENDDO + ENDIF + ENDDO + + ! Check for completion of sends + IF(KSPEC2 > 0 )THEN + DO JFLD=1,KFGATHG + IBSET = KVSET(JFLD) + IF( IBSET == MYSETV )THEN + CALL MPL_WAIT(KREQUEST=ISENDREQ(JFLD), & + & CDSTRING='GATH_GRID_CTL: WAIT') + ENDIF + ENDDO + ENDIF + CALL GSTATS(810,1) + CALL GSTATS_BARRIER2(788) + + CALL GSTATS(1644,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JFLD,JNM,II,JN,ISP) + DO JFLD=1,IMYFIELDS + IF(LDIM1_IS_FLD) THEN + DO JNM=1,KSPEC2_G + PSPECG(JFLD,JNM) = ZRECV(IDIST(JNM),JFLD) + ENDDO + IF (LLZA0IP) THEN + II = 0 + DO JN=0,KSMAX + ISP = KDIM0G(0)+JN*2+1 + II = II+2 + PSPECG(JFLD,II) = 0.0_JPRB + ENDDO + ENDIF + ELSE + DO JNM=1,KSPEC2_G + PSPECG(JNM,JFLD) = ZRECV(IDIST(JNM),JFLD) + ENDDO + IF (LLZA0IP) THEN + II = 0 + DO JN=0,KSMAX + ISP = KDIM0G(0)+JN*2+1 + II = II+2 + PSPECG(II,JFLD) = 0.0_JPRB + ENDDO + ENDIF + ENDIF + ENDDO +!$OMP END PARALLEL DO + CALL GSTATS(1644,1) + IF(ALLOCATED(ZRECV)) DEALLOCATE(ZRECV) + + !Synchronize processors + CALL GSTATS(785,0) + CALL MPL_BARRIER(CDSTRING='GATH_SPEC_CONTROL:') + CALL GSTATS(785,1) +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE GATH_SPEC_CONTROL +END MODULE GATH_SPEC_CONTROL_MOD + + diff --git a/src/trans/gpu/internal/gawl_mod.F90 b/src/trans/gpu/internal/gawl_mod.F90 new file mode 100755 index 00000000..b42178f0 --- /dev/null +++ b/src/trans/gpu/internal/gawl_mod.F90 @@ -0,0 +1,118 @@ +! (C) Copyright 1992- ECMWF. +! (C) Copyright 1992- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE GAWL_MOD +CONTAINS +SUBROUTINE GAWL(PFN,PL,PW,PEPS,KN,KITER,PMOD) + +!**** *GAWL * - Routine to perform the Newton loop + +! Purpose. +! -------- +! Find 0 of Legendre polynomial with Newton loop +!** Interface. +! ---------- +! *CALL* *GAWL(PFN,PL,PW,PEPS,KN,KITER,PMOD) + +! Explicit arguments : +! -------------------- +! PFN Fourier coefficients of series expansion +! for the ordinary Legendre polynomials (in) +! PL Gaussian latitude (inout) +! PW Gaussian weight (out) +! PEPS 0 of the machine (in) +! KN Truncation (in) +! KITER Number of iterations (out) +! PMOD Last modification (inout) + +! Implicit arguments : +! -------------------- +! None + +! Method. +! ------- +! Newton Loop. + +! Externals. +! ---------- +! CPLEDN + +! Reference. +! ---------- + +! ARPEGE Documentation vol.2, ch3. + +! Author. +! ------- +! Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 92-12-18 +! K. Yessad (Sep 2008): cleaning, improve comments. +! Nils Wedi + Mats Hamrud, 2009-02-05 revised following Swarztrauber, 2002 +! F. Vana 05-Mar-2015 Support for single precision +! ------------------------------------------------------------------ + +USE EC_PARKIND ,ONLY : JPRD, JPIM + +USE CPLEDN_MOD ,ONLY : CPLEDN + +! ------------------------------------------------------------------ + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KN +REAL(KIND=JPRD),INTENT(IN) :: PFN(0:KN/2) +REAL(KIND=JPRD),INTENT(INOUT) :: PL +REAL(KIND=JPRD),INTENT(OUT) :: PW +REAL(KIND=JPRD),INTENT(IN) :: PEPS +INTEGER(KIND=JPIM),INTENT(OUT) :: KITER +REAL(KIND=JPRD),INTENT(INOUT) :: PMOD + +! ------------------------------------------------------------------ + + +INTEGER(KIND=JPIM) :: IFLAG, ITEMAX, JTER, IODD +REAL(KIND=JPRD) :: ZW, ZX, ZXN + +! ------------------------------------------------------------------ + +!* 1. Initialization. +! --------------- + +ITEMAX = 20 +ZX = PL +IFLAG = 0 +IODD=MOD(KN,2) + +! ------------------------------------------------------------------ + +!* 2. Newton iteration. +! ----------------- + +DO JTER=1,ITEMAX+1 + KITER = JTER + CALL CPLEDN(KN,IODD,PFN,ZX,IFLAG,ZW,ZXN,PMOD) + ZX = ZXN + + IF(IFLAG == 1) EXIT + IF(ABS(PMOD) <= PEPS*1000._JPRD) IFLAG = 1 +ENDDO + +PL = ZXN +PW = ZW + +! ------------------------------------------------------------------ + +END SUBROUTINE GAWL +END MODULE GAWL_MOD + + diff --git a/src/trans/gpu/internal/growing_allocator_mod.F90 b/src/trans/gpu/internal/growing_allocator_mod.F90 new file mode 100644 index 00000000..b0486307 --- /dev/null +++ b/src/trans/gpu/internal/growing_allocator_mod.F90 @@ -0,0 +1,91 @@ +MODULE GROWING_ALLOCATOR_MOD + + USE ISO_C_BINDING, ONLY: C_INT8_T + + PRIVATE + PUBLIC :: GROWING_ALLOCATION_TYPE + PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION + + ABSTRACT INTERFACE + SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C) + USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T + IMPLICIT NONE + INTEGER(KIND=C_INT8_T), TARGET :: PTR(:) + INTEGER(C_SIZE_T), VALUE :: SZ + END SUBROUTINE + END INTERFACE + + TYPE FREE_FUNC_TYPE + PROCEDURE(FREE_FUNC_PROC), POINTER, NOPASS :: FUNC => NULL () + END TYPE + + TYPE GROWING_ALLOCATION_TYPE + INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) + TYPE(FREE_FUNC_TYPE) :: FREE_FUNCS(10) + INTEGER :: FREE_FUNCS_SZ + END TYPE + +CONTAINS + + SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ) + USE ISO_C_BINDING, ONLY: C_SIZE_T + IMPLICIT NONE + TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC + INTEGER(C_SIZE_T) :: SZ + INTEGER :: I + + ! Deallocate existing pointer + IF (ASSOCIATED(ALLOC%PTR) .AND. SZ > SIZE(ALLOC%PTR, 1, C_SIZE_T)) THEN + PRINT *, "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION" + DO I = 1, ALLOC%FREE_FUNCS_SZ + CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, & + SIZE(ALLOC%PTR, 1, C_SIZE_T)) + ENDDO + !$ACC EXIT DATA DELETE(ALLOC%PTR) + DEALLOCATE(ALLOC%PTR) + NULLIFY(ALLOC%PTR) + ENDIF + + IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN + ALLOCATE(ALLOC%PTR(SZ)) + !$ACC ENTER DATA CREATE(ALLOC%PTR) + ALLOC%FREE_FUNCS_SZ = 0 + ENDIF + END SUBROUTINE + + SUBROUTINE REGISTER_FREE_FUNCTION(ALLOC, FREE_FUNC) + IMPLICIT NONE + TYPE(GROWING_ALLOCATION_TYPE) :: ALLOC + PROCEDURE(FREE_FUNC_PROC) :: FREE_FUNC + + INTEGER :: I + + DO I = 1, ALLOC%FREE_FUNCS_SZ + IF (ASSOCIATED(ALLOC%FREE_FUNCS(I)%FUNC, FREE_FUNC)) & + RETURN + ENDDO + + ALLOC%FREE_FUNCS_SZ = ALLOC%FREE_FUNCS_SZ + 1 + IF (ALLOC%FREE_FUNCS_SZ > SIZE(ALLOC%FREE_FUNCS)) THEN + PRINT *, "TOO MANY FREE FUNCTIONS REGISTERED" + STOP 4 + ENDIF + ALLOC%FREE_FUNCS(ALLOC%FREE_FUNCS_SZ)%FUNC => FREE_FUNC + END SUBROUTINE + + SUBROUTINE REGISTER_FREE_C(ALLOC_C, FREE_FUNC_C) BIND(C, NAME="growing_allocator_register_free_c") + USE ISO_C_BINDING, ONLY: C_FUNPTR, C_PTR, C_F_PROCPOINTER, C_F_POINTER + IMPLICIT NONE + TYPE(C_PTR), VALUE :: ALLOC_C + TYPE(C_FUNPTR), VALUE :: FREE_FUNC_C + + TYPE(GROWING_ALLOCATION_TYPE), POINTER :: ALLOC + PROCEDURE(FREE_FUNC_PROC), POINTER :: FREE_FUNC + + CALL C_F_POINTER(ALLOC_C, ALLOC) + CALL C_F_PROCPOINTER(FREE_FUNC_C, FREE_FUNC) + CALL REGISTER_FREE_FUNCTION(ALLOC, FREE_FUNC) + + END SUBROUTINE + +END MODULE diff --git a/src/trans/gpu/internal/inigptr_mod.F90 b/src/trans/gpu/internal/inigptr_mod.F90 new file mode 100755 index 00000000..e1fcd781 --- /dev/null +++ b/src/trans/gpu/internal/inigptr_mod.F90 @@ -0,0 +1,88 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE INIGPTR_MOD +CONTAINS +SUBROUTINE INIGPTR(KGPTRSEND,KGPTRRECV) + +! Compute tables to assist GP to/from Fourier space transpositions + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +USE TPM_GEN ,ONLY : NOUT +USE TPM_DISTR ,ONLY : D, NPRTRNS +USE TPM_TRANS ,ONLY : NGPBLKS, NPROMA +USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(OUT) :: KGPTRSEND(2,NGPBLKS,NPRTRNS) +INTEGER(KIND=JPIM),INTENT(OUT) :: KGPTRRECV(NPRTRNS) + +INTEGER(KIND=JPIM) :: IBLOCK,IROF,IBFIRST,IPROCLAST,IPROC,IFIRST,ILAST,IBLAST +INTEGER(KIND=JPIM) :: JGL,JBL,JPRTRNS,JBLKS +! Compute tables to assist GP to/from Fourier space transpositions + + +KGPTRSEND(1,:,:)=0 +KGPTRSEND(2,:,:)=-1 +IBLOCK=1 +IROF=1 +IBFIRST=1 +IPROCLAST=D%NPROCL(D%NFRSTLOFF+1) +! for each latitude on this processor +DO JGL=1,D%NDGL_GP + ! find the processor where this row should be saved in the fourier distribution + ! this is called the "w-set" + IPROC=D%NPROCL(D%NFRSTLOFF+JGL) + ! for each latitude on this processor, find first and last points + ! for each NPROMA chunk, for each destination processor + IF(IPROC /= IPROCLAST) THEN + ! we got onto a new process, we still need to finish the last block of the previous + ! process + IF(IROF > 1) THEN + KGPTRSEND(1,IBLOCK,IPROCLAST)=IBFIRST + KGPTRSEND(2,IBLOCK,IPROCLAST)=IROF-1 + ENDIF + IF(IROF <= NPROMA) IBFIRST=IROF + IPROCLAST=IPROC + ENDIF + ! my offset of the first gridpoint in this row (globally, in EW-direction) + IFIRST=D%NSTA(D%NPTRFLOFF+JGL,MY_REGION_EW) + ! my offset of the last gridpoint in this row (globally, in EW-direction) + ILAST =IFIRST + D%NONL(D%NPTRFLOFF+JGL,MY_REGION_EW) -1 + ! now go through all gridpoints on this latitude + DO JBL=IFIRST,ILAST + IF(IROF == NPROMA) THEN + ! this block is full! + IBLAST=IROF + KGPTRSEND(1,IBLOCK,IPROC)=IBFIRST + KGPTRSEND(2,IBLOCK,IPROC)=IBLAST + IF(IBLOCK < NGPBLKS) IBLOCK=IBLOCK+1 + IROF=0 + IBFIRST=1 + ENDIF + IROF=IROF+1 + ENDDO +ENDDO +IF(IROF /= 1.AND.IROF /= IBFIRST) THEN + ! non-empty residual block after last latitude line + IBLAST=IROF-1 + KGPTRSEND(1,IBLOCK,IPROC)=IBFIRST + KGPTRSEND(2,IBLOCK,IPROC)=IBLAST +ENDIF +! sum up over blocks +KGPTRRECV(:)=SUM(KGPTRSEND(2,:,:),1)-SUM(KGPTRSEND(1,:,:),1)+NGPBLKS + +END SUBROUTINE INIGPTR +END MODULE INIGPTR_MOD diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 new file mode 100644 index 00000000..d00e5379 --- /dev/null +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -0,0 +1,242 @@ +! (C) Copyright 2001- ECMWF. +! (C) Copyright 2001- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE INV_TRANS_CTL_MOD +CONTAINS + SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& + & KF_UV,KF_SCALARS,KF_SCDERS,& + & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,FSPGL_PROC,& + & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) + + !**** *INV_TRANS_CTL* - Control routine for inverse spectral transform. + + ! Purpose. + ! -------- + ! Control routine for the inverse spectral transform + + !** Interface. + ! ---------- + ! CALL INV_TRANS_CTL(...) + + ! Explicit arguments : + ! -------------------- + ! KF_UV_G - global number of spectral u-v fields + ! KF_SCALARS_G - global number of scalar spectral fields + ! KF_GP - total number of output gridpoint fields + ! KF_FS - total number of fields in fourier space + ! KF_OUT_LT - total number of fields coming out from inverse LT + ! KF_UV - local number of spectral u-v fields + ! KF_SCALARS - local number of scalar spectral fields + ! KF_SCDERS - local number of derivatives of scalar spectral fields + ! PSPVOR(:,:) - spectral vorticity (input) + ! PSPDIV(:,:) - spectral divergence (input) + ! PSPSCALAR(:,:) - spectral scalarvalued fields (input) + ! KVSETUV(:) - indicating which 'b-set' in spectral space owns a + ! vor/div field. Equivalant to NBSETLEV in the IFS. + ! The length of KVSETUV should be the GLOBAL number + ! of u/v fields which is the dimension of u and v releated + ! fields in grid-point space. + ! KVESETSC(:) - indicating which 'b-set' in spectral space owns a + ! scalar field. As for KVSETUV this argument is required + ! if the total number of processors is greater than + ! the number of processors used for distribution in + ! spectral wave space. + ! FSPGL_PROC - external procedure to be executed in fourier space + ! before transposition + ! PGP(:,:,:) - gridpoint fields (output) + + ! The ordering of the output fields is as follows (all + ! parts are optional depending on the input switches): + + ! vorticity : KF_UV_G fields + ! divergence : KF_UV_G fields + ! u : KF_UV_G fields + ! v : KF_UV_G fields + ! scalar fields : KF_SCALARS_G fields + ! N-S derivative of scalar fields : KF_SCALARS_G fields + ! E-W derivative of u : KF_UV_G fields + ! E-W derivative of v : KF_UV_G fields + ! E-W derivative of scalar fields : KF_SCALARS_G fields + + ! Method. + ! ------- + + ! Externals. SHUFFLE - reshuffle fields for load balancing + ! ---------- FIELD_SPLIT - split fields in NPROMATR packets + ! LTINV_CTL - control of Legendre transform + ! FTINV_CTL - control of Fourier transform + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 01-01-03 + + ! ------------------------------------------------------------------ + + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD + USE ISO_C_BINDING, ONLY: C_INT8_T + + USE TPM_GEN ,ONLY : NPROMATR, NOUT + USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, GROWING_ALLOCATION + USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + USE BUFFERED_ALLOCATOR_MOD + + USE TRMTOL_MOD + USE LTINV_MOD + USE TRMTOL_PACK_UNPACK + USE FSC_MOD + USE FTINV_MOD + USE TRLTOG_MOD + + IMPLICIT NONE + + ! Declaration of arguments + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCDERS + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPVOR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPDIV(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSCALAR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3A(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3B(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC2(:,:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) + REAL(KIND=JPRB) ,OPTIONAL ,INTENT(OUT) :: PGP(:,:,:) + EXTERNAL FSPGL_PROC + OPTIONAL FSPGL_PROC + REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGPUV(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) + + ! Local variables + + REAL(KIND=JPRB), POINTER :: FOUBUF(:), FOUBUF_IN(:) + REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:), PREEL_COMPLEX(:) + REAL(KIND=JPRBT), POINTER :: ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), POINTER :: ZOUTS0(:), ZOUTA0(:) + INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & + & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET + INTEGER(KIND=JPIM) :: IF_LEG, IF_FOURIER + + INTEGER(KIND=JPIM) :: IFIRST + + TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR + TYPE(LTINV_HANDLE) :: HLTINV + TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK + TYPE(TRMTOL_HANDLE) :: HTRMTOL + TYPE(TRMTOL_UNPACK_HANDLE) :: HTRMTOL_UNPACK + TYPE(FSC_HANDLE) :: HFSC + TYPE(FTINV_HANDLE) :: HFTINV + TYPE(TRLTOG_HANDLE) :: HTRLTOG + + INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) + + ! ------------------------------------------------------------------ + + IF(NPROMATR > 0) THEN + print *, "This is currently not supported and/or tested (NPROMATR > 0j" + stop 24 + ENDIF + + ! Compute Vertical domain decomposition + + ! Initialize potentially unset offsets + KSCALARS_NSDER_OFFSET = -1 + KUV_EWDER_OFFSET = -1 + KSCALARS_EWDER_OFFSET = -1 + + ! (note in ltinv we will initially start with a slightly different domain decomposition + ! which always has vorticity and divergence because this is the actual input) + IFIRST = 0 + IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity + IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence + KUV_OFFSET = IFIRST + IFIRST = IFIRST + KF_UV ! U + IFIRST = IFIRST + KF_UV ! V + KSCALARS_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars + IF (LSCDERS) THEN + KSCALARS_NSDER_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives + ENDIF + ! the rest of fields is being computed in fourier space, namely in FSC + IF_LEG = IFIRST + IF (LUVDER) THEN + KUV_EWDER_OFFSET = IFIRST + IFIRST = IFIRST+2*KF_UV ! U and V derivatives + ENDIF + IF (LSCDERS) THEN + KSCALARS_EWDER_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives + ENDIF + IF_FOURIER = IFIRST + IF (IF_FOURIER /= KF_FS) CALL ABORT_TRANS('Size mismatch: Wrong computation KF_FS') + + ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() + HLTINV = PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) + HTRMTOL_PACK = PREPARE_TRMTOL_PACK(ALLOCATOR,IF_LEG) + HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,IF_LEG) + HTRMTOL_UNPACK = PREPARE_TRMTOL_UNPACK(ALLOCATOR,IF_FOURIER) + HFSC = PREPARE_FSC(ALLOCATOR) + HFTINV = PREPARE_FTINV(ALLOCATOR) + HTRLTOG = PREPARE_TRLTOG(ALLOCATOR,IF_FOURIER,KF_GP) + + CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) + + IF (KF_FS > 0) THEN + ! Legendre transformations + CALL GSTATS(102,0) + CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & + & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) + CALL GSTATS(102,1) + + ! Packing into send buffer, to fourier space and unpack + CALL GSTATS(152,0) + CALL TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) + CALL TRMTOL(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) + CALL TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) + CALL GSTATS(152,1) + + CALL GSTATS(107,0) + ! compute NS derivatives + CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, IF_FOURIER, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) + !Legendre transformations + CALL FTINV(ALLOCATOR, HFTINV, PREEL_COMPLEX,PREEL_REAL,IF_FOURIER) + CALL GSTATS(107,1) + ENDIF + + ! Transposition into grid-point space + CALL GSTATS(157,0) + CALL TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,IF_FOURIER,KF_GP,KF_UV_G,KF_SCALARS_G,& + & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& + & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& + & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + CALL GSTATS(157,1) + + END SUBROUTINE INV_TRANS_CTL +END MODULE INV_TRANS_CTL_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 new file mode 100755 index 00000000..1effbae6 --- /dev/null +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -0,0 +1,404 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE LEDIR_MOD + USE PARKIND_ECTRANS ,ONLY : JPIM +! USE TPM_TRANS, ONLY: LEDIR_CONFIG + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: LEDIR_STRIDES, LEDIR + + INTEGER(KIND=JPIM) :: A = 8 !Alignment +CONTAINS + SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R + USE TPM_DISTR, ONLY: D,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 + + IMPLICIT NONE + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE + + IF (PRESENT(IOUT_STRIDES0)) & + IOUT_STRIDES0 = ALIGN(2*KF_FS,A) + IF (PRESENT(IOUT_SIZE)) & + IOUT_SIZE = IOUT_STRIDES0*D_OFFSETS_GEMM2(D%NUMP+1) + IF (PRESENT(IIN_STRIDES0)) & + IIN_STRIDES0 = ALIGN(2*KF_FS,A) + IF (PRESENT(IIN_SIZE)) & + IIN_SIZE = IIN_STRIDES0*D_OFFSETS_GEMM1(D%NUMP+1) + IF (PRESENT(IOUT0_STRIDES0)) & + IOUT0_STRIDES0 = ALIGN(KF_FS,A) + IF (PRESENT(IOUT0_SIZE)) & + IOUT0_SIZE = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IIN0_STRIDES0)) & + IIN0_STRIDES0 = ALIGN(KF_FS,A) + IF (PRESENT(IIN0_SIZE)) & + IIN0_SIZE = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) + END SUBROUTINE + + SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) + !**** *LEDIR* - Direct Legendre transform. + + ! Purpose. + ! -------- + ! Direct Legendre tranform of state variables. + + !** Interface. + ! ---------- + ! CALL LEDIR(...) + + ! Explicit arguments : KM - zonal wavenumber + ! -------------------- KFC - number of field to transform + ! fields for zonal wavenumber KM + ! PSIA - symmetric part of Fourier + ! fields for zonal wavenumber KM + ! POA1 - spectral + ! fields for zonal wavenumber KM + + ! Implicit arguments : None. + ! -------------------- + + ! Method. + ! ------- use butterfly or dgemm + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Nils Wedi + Mats Hamrud + George Modzynski + + ! Modifications. + ! -------------- + ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: + ! F. Vana 05-Mar-2015 Support for single precision + ! ------------------------------------------------------------------ + + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE TPM_DIM ,ONLY : R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL + USE TPM_GEOMETRY ,ONLY : G_NDGLU + USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 + USE TPM_DISTR ,ONLY : D_NUMP,D_MYMS,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 + USE HICBLAS_MOD ,ONLY : HIP_GEMM_BATCHED, HIP_DGEMM_BATCHED_OVERLOAD, & + & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE, INTRINSIC :: ISO_C_BINDING + USE IEEE_ARITHMETIC + USE OPENACC + +#ifdef TRANS_SINGLE +#define HIP_GEMM HIP_SGEMM_GROUPED_OVERLOAD +#else +#define HIP_GEMM HIP_DGEMM_GROUPED_OVERLOAD +#endif + + IMPLICIT NONE + + ! DUMMY ARGUMENTS + REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) + REAL(KIND=JPRBT), INTENT(INOUT) :: ZOUT(:) + REAL(KIND=JPRD), INTENT(INOUT) :: ZOUT0(:) + REAL(KIND=JPRBT), INTENT(OUT), POINTER :: POA1(:,:,:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + + ! LOCAL VARIABLES + INTEGER(KIND=JPIM) :: KM + INTEGER(KIND=JPIM) :: KMLOC + INTEGER(KIND=JPIM) :: IA, IS, ISL, J + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2 + + INTEGER(KIND=JPIM) :: IGLS, JF, JGL + INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 + + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS + + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) + + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA & + !$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & + !$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX,R_NSMAX,G_NDGLU) & + !$ACC& PRESENT(ZAA,ZAS,POA1,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) +#endif + + ! anti-symmetric + IF(KMLOC0 > 0) THEN + PRINT*,'computing m=0 in double precision' + ENDIF + + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(414,0) + + IF(KMLOC0 > 0) THEN + ! compute m=0 in double precision: +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZAA0,ZINPA0,ZOUT0) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINPA0,ZOUT0) +#endif + CALL HIP_DGEMM_BATCHED_OVERLOAD( & + & 'N', 'N', & + & KF_FS, (R_NSMAX+2)/2, G_NDGLU(0), & + & 1.0_JPRD, & + & ZINPA0, IIN0_STRIDES0, 0, & + & ZAA0, SIZE(ZAA0,1), 0, & + & 0.0_JPRD, & + & ZOUT0, IOUT0_STRIDES0, 0, & + & 1, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + ENDIF + ! Get C in transpose format to get better memory access patterns later + !C=A*B => + ! C^T=B^T*A^T + DO KMLOC=1,D_NUMP + KM = D_MYMS(KMLOC) + NS(KMLOC) = (R_NSMAX-KM+2)/2 + KS(KMLOC) = G_NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) + BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) + ENDDO + IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 + ENDIF +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZAA,ZINPA,ZOUT) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZAA,ZINPA,ZOUT) +#endif + CALL HIP_GEMM( & + & 21, & ! unique identifier + & 'N', 'N', & + & 2*KF_FS, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINPA, IIN_STRIDES0, AOFFSETS, & + & ZAA, SIZE(ZAA,1), BOFFSETS, & + & 0.0_JPRBT, & + & ZOUT, IOUT_STRIDES0, COFFSETS, & + & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(434,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(434,1) + ENDIF + CALL GSTATS(414,1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) FIRSTPRIVATE(KF_FS,IOUT_STRIDES0,IOUT0_STRIDES0) DEFAULT(NONE) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JF=1,2*KF_FS + KM = D_MYMS(KMLOC) + IA = 1+MOD(R_NTMAX-KM+2,2) + IF (KM /= 0) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC LOOP SEQ +#endif + DO J=1,(R_NSMAX-KM+2)/2 + POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IOUT_STRIDES0) + ENDDO + ELSEIF (MOD(JF-1,2) == 0) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC LOOP SEQ +#endif + DO J=1,(R_NSMAX+2)/2 + POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) + ENDDO + ENDIF + ENDDO + ENDDO + + ! symmetric + + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(414,0) + + IF(KMLOC0 > 0) THEN +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZAS0,ZINPS0,ZOUT0) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZAS0,ZINPS0,ZOUT0) +#endif + ! compute m=0 in double precision: + call HIP_DGEMM_BATCHED_OVERLOAD( & + & 'N', 'N', & + & KF_FS, (R_NSMAX+3)/2, G_NDGLU(0), & + & 1.0_JPRD, & + & ZINPS0, IIN0_STRIDES0, 0, & + & ZAS0, SIZE(ZAS0,1), 0, & + & 0.0_JPRD, & + & ZOUT0, IOUT0_STRIDES0, 0, & + & 1, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + ENDIF + + ! Get C in transpose format to get better memory access patterns later + !C=A*B => + ! C^T=B^T*A^T + DO KMLOC=1,D_NUMP + KM = D_MYMS(KMLOC) + NS(KMLOC) = (R_NSMAX-KM+3)/2 + KS(KMLOC) = G_NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) + BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) + ENDDO + IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 + ENDIF +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZAS,ZINPS,ZOUT) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZAS,ZINPS,ZOUT) +#endif + CALL HIP_GEMM( & + & 22, & ! unique identifier + & 'N', 'N', & + & 2*KF_FS, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINPS, IIN_STRIDES0, AOFFSETS, & + & ZAS, SIZE(ZAS,1), BOFFSETS, & + & 0.0_JPRBT, & + & ZOUT, IOUT_STRIDES0, COFFSETS, & + & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(434,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(434,1) + ENDIF + CALL GSTATS(414,1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) FIRSTPRIVATE(KF_FS,IOUT_STRIDES0,IOUT0_STRIDES0) DEFAULT(NONE) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JF=1,2*KF_FS + KM = D_MYMS(KMLOC) + IS = 1+MOD(R_NTMAX-KM+1,2) + IF (KM /= 0) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC LOOP SEQ +#endif + DO J=1,(R_NSMAX-KM+3)/2 + POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IOUT_STRIDES0) + ENDDO + ELSEIF (MOD(JF-1,2) == 0) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC LOOP SEQ +#endif + DO J=1,(R_NSMAX+3)/2 + POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) + ENDDO + ENDIF + ENDDO + ENDDO +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC WAIT(1) + + !$ACC END DATA +#endif + + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ + END SUBROUTINE LEDIR +END MODULE LEDIR_MOD diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 new file mode 100755 index 00000000..3d9c5642 --- /dev/null +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -0,0 +1,418 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE LEINV_MOD + USE PARKIND_ECTRANS ,ONLY : JPIM + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: LEINV_STRIDES, LEINV + + INTEGER(KIND=JPIM) :: A = 8 !Alignment + +CONTAINS + SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R + USE TPM_DISTR, ONLY: D,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 + + IMPLICIT NONE + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE + + + IF (PRESENT(IOUT_STRIDES0)) & + IOUT0_STRIDES0 = ALIGN(KF_LEG,A) + IF (PRESENT(IOUT0_SIZE)) & + IOUT0_SIZE = IOUT0_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IIN_STRIDES0)) & + IIN_STRIDES0 = ALIGN(2*KF_LEG,A) + IF (PRESENT(IIN_SIZE)) & + IIN_SIZE = IIN_STRIDES0*D_OFFSETS_GEMM2(D%NUMP+1) + IF (PRESENT(IOUT0_STRIDES0)) & + IOUT_STRIDES0 = ALIGN(2*KF_LEG,A) + IF (PRESENT(IOUT_SIZE)) & + IOUT_SIZE = IOUT_STRIDES0*D_OFFSETS_GEMM1(D%NUMP+1) + IF (PRESENT(IIN0_STRIDES0)) & + IIN0_STRIDES0 = ALIGN(KF_LEG,A) + IF (PRESENT(IIN0_SIZE)) & + IIN0_SIZE = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + END SUBROUTINE LEINV_STRIDES + + SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) + !**** *LEINV* - Inverse Legendre transform. + + ! Purpose. + ! -------- + ! Inverse Legendre tranform of all variables(kernel). + + !** Interface. + ! ---------- + ! CALL LEINV(...) + + ! Explicit arguments : KM - zonal wavenumber (input-c) + ! -------------------- KFC - number of fields to tranform (input-c) + ! PIA - spectral fields + ! for zonal wavenumber KM (input) + + ! Implicit arguments : None. + ! -------------------- + + ! Method. + ! ------- + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Nils Wedi + Mats Hamrud + George Modzynski + ! + ! Modifications. + ! -------------- + ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: + ! F. Vana 05-Mar-2015 Support for single precision + ! ------------------------------------------------------------------ + + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB, JPRBT, JPRD + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE TPM_DIM ,ONLY : R_NDGNH,R_NSMAX, R_NDGL + USE TPM_GEOMETRY ,ONLY : G_NDGLU + USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 + USE TPM_DISTR ,ONLY : D_NUMP,D_MYMS,MYPROC,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 + USE HICBLAS_MOD ,ONLY : HIP_GEMM_BATCHED, HIP_DGEMM_BATCHED_OVERLOAD, & + & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD +#ifdef TRANS_SINGLE +#define HIP_GEMM HIP_SGEMM_GROUPED_OVERLOAD +#else +#define HIP_GEMM HIP_DGEMM_GROUPED_OVERLOAD +#endif + + USE, INTRINSIC :: ISO_C_BINDING + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + IMPLICIT NONE + + REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + REAL(KIND=JPRBT), INTENT(OUT) :: ZINP(:), ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), INTENT(OUT) :: ZINP0(:), ZOUTS0(:), ZOUTA0(:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + + ! LOCAL + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + + + !* 1.1 PREPARATIONS. + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) + + ! ------------------------------------------------------------------ + + !* 1. PERFORM LEGENDRE TRANFORM. + ! -------------------------- + + !* 1.1 PREPARATIONS. + + CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) + + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(D_MYMS,D_NUMP,G_NDGLU) & + !$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & + !$ACC& PRESENT(ZAA,ZAS,PIA) & + !$ACC& PRESENT(R_NSMAX,G_NDGLU,D_OFFSETS_GEMM2) +#endif + + IF (KMLOC0 > 0) THEN + print*,'computing m=0 in double precision' + ENDIF + + ! READ 2:NSMAX+3 + + !IF KM=0 and NSMAX is 6: + ! IA=1 + ! DO=1,6/2+1 ... 1..4 + ! PIA_2=1+1+(J-1)*2 ...2+(0..3)*2 .... 2,4,6,8 + !IF KM=0 and NSMAX is 7: + ! IA=2 + ! DO=1,7/2+1 ... 1..4 + ! PIA_2=2+1+(1..4-1)*2 ...3+(0..3)*2 .... 3,5,7,9 + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) FIRSTPRIVATE(KF_LEG,IIN_STRIDES0,IIN0_STRIDES0) DEFAULT(NONE) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JK=1,2*KF_LEG + KM = D_MYMS(KMLOC) + IA = 1+MOD(R_NSMAX-KM+2,2) + IF(KM /= 0)THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC LOOP SEQ +#endif + DO J=1,(R_NSMAX-KM+2)/2 + ZINP(JK+(J-1)*IIN_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IIN_STRIDES0)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ENDDO + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC LOOP SEQ +#endif + DO J=1,(R_NSMAX+2)/2 + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC) + ENDDO + ENDIF + ENDDO + ENDDO + + + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(424,0) + + IF (KMLOC0 > 0) THEN + ! compute m=0 in double precision +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZAA0,ZINP0,ZOUTA0) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINP0,ZOUTA0) +#endif + CALL HIP_DGEMM_BATCHED_OVERLOAD( & + & 'N', 'T', & + & KF_LEG, G_NDGLU(0), (R_NSMAX+2)/2, & + & 1.0_JPRD, & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAA0, SIZE(ZAA0,1), 0, & + & 0.0_JPRD, & + & ZOUTA0, IOUT0_STRIDES0, 0, & + & 1, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + ENDIF + + + + DO KMLOC=1,D_NUMP + KM = D_MYMS(KMLOC) + KS(KMLOC) = (R_NSMAX-KM+2)/2 + NS(KMLOC) = G_NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) + BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) + ENDDO + IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 + ENDIF +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZAA,ZINP,ZOUTA) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUTA) +#endif + CALL HIP_GEMM( & + & 11, & ! unique identifier + & 'N', 'T', & + & 2*KF_LEG, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINP, IIN_STRIDES0, AOFFSETS, & + & ZAA, SIZE(ZAA,1), BOFFSETS, & + & 0.0_JPRBT, & + & ZOUTA, IOUT_STRIDES0, COFFSETS, & + & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(444,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(444,1) + ENDIF + CALL GSTATS(424,1) + + ! 2. +++++++++++++ symmetric + !IF KM=0 and NSMAX is 6: + ! IS=2 + ! DO=1,4 + ! PIA_2=2+1+(0..3)*2 ... 3+(0..3)*2 ... 3,5,7,9 + !IF KM=0 and NSMAX is 7: + ! IS=1 + ! DO=1,5 + ! PIA_2=1+1+(1..5-1)*2 ...2+(0..4)*2 .... 2,4,6,8,10 + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) FIRSTPRIVATE(KF_LEG,IIN_STRIDES0,IIN0_STRIDES0) DEFAULT(NONE) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JK=1,2*KF_LEG + KM = D_MYMS(KMLOC) + IS = 1+MOD(R_NSMAX-KM+1,2) + IF(KM /= 0) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC LOOP SEQ +#endif + DO J=1,(R_NSMAX-KM+3)/2 + ZINP(JK+(J-1)*IIN_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IIN_STRIDES0)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ENDDO + ELSEIF (MOD((JK-1),2) == 0) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC LOOP SEQ +#endif + DO J=1,(R_NSMAX+3)/2 + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC) + ENDDO + ENDIF + ENDDO + ENDDO + + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(424,0) + + IF (KMLOC0 > 0) THEN +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZAS0,ZINP0,ZOUTS0) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZAS0,ZINP0,ZOUTS0) +#endif + CALL HIP_DGEMM_BATCHED_OVERLOAD( & + & 'N', 'T', & + & KF_LEG, G_NDGLU(0), (R_NSMAX+3)/2, & + & 1.0_JPRD, & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAS0, SIZE(ZAS0,1), 0, & + & 0.0_JPRD, & + & ZOUTS0, IOUT0_STRIDES0, 0, & + & 1, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + ENDIF + + DO KMLOC=1,D_NUMP + KM = D_MYMS(KMLOC) + KS(KMLOC) = (R_NSMAX-KM+3)/2 + NS(KMLOC) = G_NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) + BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) + ENDDO + IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 + ENDIF +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZAS,ZINP,ZOUTS) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUTS) +#endif + CALL HIP_GEMM( & + & 12, & ! unique identifier + & 'N', 'T', & + & 2*KF_LEG, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINP, IIN_STRIDES0, AOFFSETS, & + & ZAS, SIZE(ZAS,1), BOFFSETS, & + & 0.0_JPRBT, & + & ZOUTS, IOUT_STRIDES0, COFFSETS, & + & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(444,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(444,1) + ENDIF + CALL GSTATS(424,1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC WAIT(1) + + !$ACC END DATA +#endif + + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ + END SUBROUTINE LEINV +END MODULE LEINV_MOD diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 new file mode 100755 index 00000000..3f3f2eb0 --- /dev/null +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -0,0 +1,294 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 1987- ECMWF. +! (C) Copyright 1987- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE LTDIR_MOD + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: PREPARE_LTDIR, LTDIR_HANDLE, LTDIR + + TYPE LTDIR_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUT_AND_POA + END TYPE + +CONTAINS + FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DISTR, ONLY: D + USE TPM_DIM, ONLY: R + USE ISO_C_BINDING + USE LEDIR_MOD + USE BUFFERED_ALLOCATOR_MOD + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + TYPE(LTDIR_HANDLE) :: HLTDIR + + INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY + + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) + + ! POA1 + IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ! POA2 + IALLOC_SZ = IALLOC_SZ + ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ! ZOUT + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ! ZOUT0 + IALLOC_SZ = IALLOC_SZ+ ALIGN(IOUT0_SIZE*SIZEOF(ZPRD_DUMMY),128) + + HLTDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ) + END FUNCTION PREPARE_LTDIR + + SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2, & + & KFLDPTRUV,KFLDPTRSC) + + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD, JPRB + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + + USE TPM_DIM ,ONLY : R + USE TPM_DISTR ,ONLY : D + USE TPM_GEOMETRY + + USE PREPSNM_MOD ,ONLY : PREPSNM + USE LEDIR_MOD + USE UVTVD_MOD + USE UPDSP_MOD ,ONLY : UPDSP + USE UPDSPB_MOD ,ONLY : UPDSPB + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE BUFFERED_ALLOCATOR_MOD + USE ISO_C_BINDING, ONLY: C_SIZE_T, C_F_POINTER, C_LOC + + + !**** *LTDIR* - Control of Direct Legendre transform step + + ! Purpose. + ! -------- + ! Tranform from Fourier space to spectral space, compute + ! vorticity and divergence. + + !** Interface. + ! ---------- + ! *CALL* *LTDIR(...)* + + ! Explicit arguments : + ! -------------------- KM - zonal wavenumber + ! KMLOC - local zonal wavenumber + + ! Implicit arguments : None + ! -------------------- + + ! Method. + ! ------- + + ! Externals. + ! ---------- + ! PREPSNM - prepare REPSNM for wavenumber KM + ! PRFI2 - prepares the Fourier work arrays for model variables. + ! LEDIR - direct Legendre transform + ! UVTVD - + ! UPDSP - updating of spectral arrays (fields) + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Mats Hamrud and Philippe Courtier *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 87-11-24 + ! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite + ! for uv formulation + ! Modified 93-03-19 D. Giard - CDCONF='T' for tendencies + ! Modified 93-11-18 M. Hamrud - use only one Fourier buffer + ! Modified 94-04-06 R. El khatib Full-POS implementation + ! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div + ! instead of u,v->vor,div + ! MPP Group : 95-10-01 Support for Distributed Memory version + ! K. YESSAD (AUGUST 1996): + ! - Legendre transforms for transmission coefficients. + ! Modified : 04/06/99 D.Salmond : change order of AIA and SIA + ! R. El Khatib 12-Jul-2012 LDSPC2 replaced by UVTVD + ! ------------------------------------------------------------------ + + IMPLICIT NONE + + ! DUMMY INTEGER SCALARS + INTEGER(KIND=JPIM) :: KM + INTEGER(KIND=JPIM) :: KMLOC + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS + + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC2(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3A(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3B(:,:,:) + INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) + INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) + REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) + + ! LOCAL INTEGER SCALARS + INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU, IFIRST + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPRB), POINTER :: POA1_L(:), POA1(:,:,:) + REAL(KIND=JPRB), POINTER :: POA2_L(:), POA2(:,:,:) + REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) + REAL(KIND=JPRBT), POINTER :: ZOUT(:) + REAL(KIND=JPRD), POINTER :: ZOUT0(:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(LTDIR_HANDLE), INTENT(IN) :: HLTDIR + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + + + + ! ------------------------------------------------------------------ + IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',0,ZHOOK_HANDLE) + + ! ------------------------------------------------------------------ + + !* 1. PREPARE LEGENDRE POLONOMIALS AND EPSNM + ! -------------------------------------- + + + ! ------------------------------------------------------------------ + + !* 2. PREPARE WORK ARRAYS. + ! -------------------- + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) + + IALLOC_POS = 1 + + IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(POA1_L(1)),128) + CALL ASSIGN_PTR(POA1_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) + CALL C_F_POINTER(C_LOC(POA1_L), POA1, (/ 2*KF_FS, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + IALLOC_SZ = ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(POA2_L(1)),128) + CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) + CALL C_F_POINTER(C_LOC(POA2_L), POA2, (/ 4*KF_UV, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUT + IALLOC_SZ = ALIGN(IOUT_SIZE*SIZEOF(ZOUT(1)),128) + CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUT0 + IALLOC_SZ = ALIGN(IOUT0_SIZE*SIZEOF(ZOUT0(1)),128) + CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! do the legendre transform + CALL LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA COPYOUT(PSPVOR,PSPDIV) IF(KF_UV > 0) + !$ACC DATA COPYOUT(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) + !$ACC DATA COPYOUT(PSPSC2) IF(NF_SC2 > 0) + !$ACC DATA COPYOUT(PSPSC3A) IF(NF_SC3A > 0) + !$ACC DATA COPYOUT(PSPSC3B) IF(NF_SC3B > 0) +#endif + + ! ------------------------------------------------------------------ + + !* 5. COMPUTE VORTICITY AND DIVERGENCE. + ! --------------------------------- + + IF( KF_UV > 0 ) THEN + ! U and V are in POA1 + IFIRST = 0 + PU => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV + PV => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) + ! Compute VOR and DIV ino POA2 + IFIRST = 0 + PVOR => POA2(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV + PDIV => POA2(IFIRST+1:IFIRST+2*KF_UV,:,:) + + ! Compute vorticity and divergence + CALL UVTVD(KF_UV,PU,PV,PVOR,PDIV) + + ! Write back. Note, if we have UV, the contract says we *must* have VOR/DIV + CALL UPDSPB(KF_UV,PVOR,PSPVOR,KFLDPTRUV) + CALL UPDSPB(KF_UV,PDIV,PSPDIV,KFLDPTRUV) + + ENDIF + ! ------------------------------------------------------------------ + + !* 6. UPDATE SPECTRAL ARRAYS. + ! ----------------------- + + ! this is on the host, so need to cp from device, Nils + CALL UPDSP(KF_UV,KF_SCALARS,POA1,& + & PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2 , & + & KFLDPTRUV,KFLDPTRSC) + + !$ACC WAIT(1) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(412,0) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA +#endif + IF (LSYNC_TRANS) THEN + CALL GSTATS(432,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(432,1) + ENDIF + CALL GSTATS(412,1) + + ! ------------------------------------------------------------------ + + IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',1,ZHOOK_HANDLE) + END SUBROUTINE LTDIR +END MODULE LTDIR_MOD diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 new file mode 100755 index 00000000..861aea97 --- /dev/null +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -0,0 +1,409 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE LTINV_MOD + USE BUFFERED_ALLOCATOR_MOD + + IMPLICIT NONE + + PRIVATE + PUBLIC :: LTINV, LTINV_HANDLE, PREPARE_LTINV + + TYPE LTINV_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPIA_AND_IN + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUTS_AND_OUTA + END TYPE + +CONTAINS + FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT(HLTINV) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DISTR, ONLY: D + USE TPM_DIM, ONLY: R + USE ISO_C_BINDING + USE LEINV_MOD + USE BUFFERED_ALLOCATOR_MOD + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV,KF_SCALARS + LOGICAL, INTENT(IN) :: LVORGP,LDIVGP,LSCDERS + + TYPE(LTINV_HANDLE) :: HLTINV + + INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ, IPIA_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY + + INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG + + ! # fields that are initially read. We always read vorticity + ! and divergence! Also keep in mind that we actually have 2X + ! this number of levels because real+complex + IF_READIN = 0 + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! U + IF_READIN = IF_READIN + KF_UV ! V + IF_READIN = IF_READIN + KF_SCALARS ! Scalars + IF (LSCDERS) & + IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives + + IPIA_SZ = ALIGN(2*IF_READIN*(R%NSMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + + ! In Legendre space, we then ignore vorticity/divergence, if + ! they don't need to be transformed. + IF_LEG = IF_READIN + IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed + IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed + + CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) + + ! PIA + IALLOC_SZ = IPIA_SZ + ! ZINP + IALLOC_SZ = IALLOC_SZ + ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ! ZINP0 + IALLOC_SZ = IALLOC_SZ + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) + + HLTINV%HPIA_AND_IN = RESERVE(ALLOCATOR, IALLOC_SZ) + + IALLOC_SZ = 0 + ! ZOUTA + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ! ZOUTS + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ! ZOUTA0 + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*SIZEOF(ZPRD_DUMMY),128) + ! ZOUTS0 + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*SIZEOF(ZPRD_DUMMY),128) + + HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ) + + END FUNCTION PREPARE_LTINV + + SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & + & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + + USE TPM_DIM ,ONLY : R + USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, LSCDERS + USE TPM_FLT + USE TPM_GEOMETRY + USE TPM_DISTR ,ONLY : D + USE PRFI1B_MOD ,ONLY : PRFI1B + USE VDTUV_MOD ,ONLY : VDTUV + USE SPNSDE_MOD ,ONLY : SPNSDE + USE LEINV_MOD + USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + use ieee_arithmetic + USE TPM_FIELDS ,ONLY : F,ZEPSNM + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + + !**** *LTINV* - Inverse Legendre transform + ! + ! Purpose. + ! -------- + ! Tranform from Laplace space to Fourier space, compute U and V + ! and north/south derivatives of state variables. + + !** Interface. + ! ---------- + ! *CALL* *LTINV(...) + + ! Explicit arguments : + ! -------------------- + ! KM - zonal wavenumber + ! KMLOC - local zonal wavenumber + ! PSPVOR - spectral vorticity + ! PSPDIV - spectral divergence + ! PSPSCALAR - spectral scalar variables + + ! Implicit arguments : The Laplace arrays of the model. + ! -------------------- The values of the Legendre polynomials + ! The grid point arrays of the model + ! Method. + ! ------- + + ! Externals. + ! ---------- + + ! PREPSNM - prepare REPSNM for wavenumber KM + ! PRFI1B - prepares the spectral fields + ! VDTUV - compute u and v from vorticity and divergence + ! SPNSDE - compute north-south derivatives + ! LEINV - Inverse Legendre transform + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + ! Temperton, 1991, MWR 119 p1303 + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 00-02-01 From LTINV in IFS CY22R1 + ! ------------------------------------------------------------------ + + IMPLICIT NONE + + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS + + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPVOR(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPDIV(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSCALAR(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC2(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3A(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) + REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), POINTER, INTENT(OUT) :: ZOUTS0(:), ZOUTA0(:) + + INTEGER(KIND=JPIM) :: IFIRST, J3 + + REAL(KIND=JPRB), POINTER :: PIA_L(:), PIA(:,:,:) + REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) + REAL(KIND=JPRB), POINTER :: PSCALARS(:,:,:), PSCALARS_NSDER(:,:,:) + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV + + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE + + INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + + REAL(KIND=JPRBT), POINTER :: ZINP(:) + REAL(KIND=JPRD), POINTER :: ZINP0(:) + + ! ------------------------------------------------------------------ + + !* 1. PERFORM LEGENDRE TRANFORM. + ! -------------------------- + + IF (LHOOK) CALL DR_HOOK('LTINV_MOD',0,ZHOOK_HANDLE) + + ! Get all pointers + IF_READIN = 0 + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! U + IF_READIN = IF_READIN + KF_UV ! V + IF_READIN = IF_READIN + KF_SCALARS ! Scalars + IF (LSCDERS) & + IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives + + ! In Legendre space, we then ignore vorticity/divergence, if + ! they don't need to be transformed. + IF_LEG = IF_READIN + IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed + IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed + + CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) + + IALLOC_POS = 1 + + ! PIA + IALLOC_SZ = ALIGN(2*IF_READIN*(R%NTMAX+3)*D%NUMP*SIZEOF(PIA_L(1)),128) + CALL ASSIGN_PTR(PIA_L, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + CALL C_F_POINTER(C_LOC(PIA_L), PIA, (/ 2*IF_READIN, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZINP + IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINP(1)),128) + CALL ASSIGN_PTR(ZINP, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZINP0 + IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINP0(1)),128) + CALL ASSIGN_PTR(ZINP0, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + IALLOC_POS = 1 + + ! ZOUTA + IALLOC_SZ = ALIGN(IOUT_SIZE*SIZEOF(ZOUTA(1)),128) + CALL ASSIGN_PTR(ZOUTA, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTS + IALLOC_SZ = ALIGN(IOUT_SIZE*SIZEOF(ZOUTS(1)),128) + CALL ASSIGN_PTR(ZOUTS, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTA0 + IALLOC_SZ = ALIGN(IOUT0_SIZE*SIZEOF(ZOUTA0(1)),128) + CALL ASSIGN_PTR(ZOUTA0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTS0 + IALLOC_SZ = ALIGN(IOUT0_SIZE*SIZEOF(ZOUTS0(1)),128) + CALL ASSIGN_PTR(ZOUTS0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! Assign pointers do the different components of PIA + IFIRST = 0 + IF (.NOT. LVORGP .OR. LDIVGP) THEN + ! Usually we want to store vorticity first + PVOR => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Vorticity + + PDIV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Divergence + ELSE + ! Except if we want to translate Vorticity but not Divergence, we should have Divergence first + ! Then we have all buffers that move on in a contiguous buffer + PDIV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Divergence + + PVOR => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Vorticity + ENDIF + PU => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! U + PV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! V + PSCALARS => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) + IFIRST = IFIRST + 2*KF_SCALARS ! Scalars + IF (LSCDERS) THEN + PSCALARS_NSDER => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) + IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives + ENDIF + + ! ------------------------------------------------------------------ + + + !* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. + ! ---------------------------------------------- + + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(422,0) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) + !$ACC DATA COPYIN(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) + !$ACC DATA COPYIN(PSPSC2) IF(NF_SC2 > 0) + !$ACC DATA COPYIN(PSPSC3A) IF(NF_SC3A > 0) + !$ACC DATA COPYIN(PSPSC3B) IF(NF_SC3B > 0) +#endif + IF (LSYNC_TRANS) THEN + CALL GSTATS(442,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(442,1) + ENDIF + CALL GSTATS(422,1) + + IF (KF_UV > 0) THEN + CALL PRFI1B(PVOR,PSPVOR,KF_UV,UBOUND(PSPVOR,2)) + CALL PRFI1B(PDIV,PSPDIV,KF_UV,UBOUND(PSPDIV,2)) + + ! Compute U and V for VOR and DIV + CALL VDTUV(KF_UV,ZEPSNM,PVOR,PDIV,PU,PV) + ENDIF + + IF (KF_SCALARS > 0) THEN + IF(PRESENT(PSPSCALAR)) THEN + CALL PRFI1B(PSCALARS,PSPSCALAR,KF_SCALARS,UBOUND(PSPSCALAR,2)) + ELSE + IFIRST = 1 + IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC2-1,:,:),PSPSC2(:,:),NF_SC2,UBOUND(PSPSC2,2)) + IFIRST = IFIRST+2*NF_SC2 + ENDIF + IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN + DO J3=1,UBOUND(PSPSC3A,3) + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3A-1,:,:),PSPSC3A(:,:,J3),NF_SC3A,UBOUND(PSPSC3A,2)) + IFIRST = IFIRST+2*NF_SC3A + ENDDO + ENDIF + IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN + DO J3=1,UBOUND(PSPSC3B,3) + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3B-1,:,:),PSPSC3B(:,:,J3),NF_SC3B,UBOUND(PSPSC3B,2)) + IFIRST = IFIRST+2*NF_SC3B + ENDDO + ENDIF + IF(IFIRST-1 /= 2*KF_SCALARS) THEN + WRITE(0,*) 'LTINV:KF_SCALARS,IFIRST',KF_SCALARS,IFIRST + CALL ABORT_TRANS('LTINV_MOD:IFIRST /= 2*KF_SCALARS') + ENDIF + ENDIF + ENDIF + + ! Compute NS derivatives if needed + IF (LSCDERS) THEN + CALL SPNSDE(KF_SCALARS,ZEPSNM,PSCALARS,PSCALARS_NSDER) + ENDIF + +#ifdef OMPGPU + !$OMP END TARGET DATA + !$OMP END TARGET DATA + !$OMP END TARGET DATA + !$OMP END TARGET DATA + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC WAIT(1) + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA +#endif + + ! ------------------------------------------------------------------ + + + !* 4. INVERSE LEGENDRE TRANSFORM. + ! --------------------------- + + ! Legendre transforms. When converting PIA into ZOUT, we ignore the first entries of LEINV. + ! This is because vorticity and divergence is not necessarily converted to GP space. + CALL LEINV(ALLOCATOR,PIA(2*(IF_READIN-IF_LEG)+1:IF_READIN,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,IF_LEG) + + IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ + END SUBROUTINE LTINV +END MODULE LTINV_MOD + diff --git a/src/trans/gpu/internal/myrecvset_mod.F90 b/src/trans/gpu/internal/myrecvset_mod.F90 new file mode 100755 index 00000000..093323f8 --- /dev/null +++ b/src/trans/gpu/internal/myrecvset_mod.F90 @@ -0,0 +1,83 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE MYRECVSET_MOD +CONTAINS +FUNCTION MYRECVSET(KSETS,KMYSET,KSET) + + +!**** *MYRECVSET* RETURNS SET NUMBER TO SEND TO + +! Purpose. +! -------- +! + +!** Interface. +! ---------- +! ISENDSET = MYRECVSET(KSETS,KMYSET,KSET) + +! Explicit arguments : +! -------------------- +! input: KSETS + +! Implicit arguments : NONE +! -------------------- +! Method. +! ------- + +! + +! Externals. +! ---------- +! NONE + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-02-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE +INTEGER(KIND=JPIM) :: MYRECVSET +INTEGER(KIND=JPIM),INTENT(IN) :: KSETS,KMYSET,KSET + + +! ------------------------------------------------------------------ + +!* 1. Check input argument for validity +! --------------------------------- + +IF(KSETS < 1 .OR. KMYSET > KSETS .OR. KSET > KSETS-1) THEN + + CALL ABORT_TRANS(' MYRECVSET: INVALID ARGUMENT ') + +ELSE + +!* 2. Compute output parameters +! ------------------------- + + MYRECVSET = MOD(-KSET-1+KMYSET+KSETS,KSETS)+1 + +ENDIF + +END FUNCTION MYRECVSET +END MODULE MYRECVSET_MOD diff --git a/src/trans/gpu/internal/mysendset_mod.F90 b/src/trans/gpu/internal/mysendset_mod.F90 new file mode 100755 index 00000000..636025e3 --- /dev/null +++ b/src/trans/gpu/internal/mysendset_mod.F90 @@ -0,0 +1,80 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE MYSENDSET_MOD +CONTAINS +FUNCTION MYSENDSET(KSETS,KMYSET,KSET) + + +!**** *MYSENDSET* RETURNS SET NUMBER TO SEND TO + +! Purpose. +! -------- +! + +!** Interface. +! ---------- +! ISENDSET = MYSENDSET(KSETS,KMYSET,KSET) + +! Explicit arguments : +! -------------------- +! input: KSETS + +! Implicit arguments : NONE +! -------------------- +! Method. +! ------- + +! Externals. +! ---------- +! NONE + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 00-02-03 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + +IMPLICIT NONE +INTEGER(KIND=JPIM) :: MYSENDSET +INTEGER(KIND=JPIM),INTENT(IN) :: KSETS,KMYSET,KSET + + +! ------------------------------------------------------------------ + +!* 1. Check input argument for validity +! --------------------------------- + +IF(KSETS < 1 .OR. KMYSET > KSETS .OR. KSET > KSETS-1) THEN + + CALL ABORT_TRANS(' MYSENDSET: INVALID ARGUMENT ') + +ELSE + +!* 2. Compute output parameters +! ------------------------- + + MYSENDSET = MOD(KMYSET+KSET-1,KSETS)+1 + +ENDIF + +END FUNCTION MYSENDSET +END MODULE MYSENDSET_MOD diff --git a/src/trans/gpu/internal/parkind_ectrans.F90 b/src/trans/gpu/internal/parkind_ectrans.F90 new file mode 100644 index 00000000..d2d88033 --- /dev/null +++ b/src/trans/gpu/internal/parkind_ectrans.F90 @@ -0,0 +1,38 @@ +! (C) Copyright 2021- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE PARKIND_ECTRANS +! +! Re-export precision-related symbols defined in fiat / parkind1, +! and add ECTRANS-internal precision-related symbols + +USE PARKIND1 +! +IMPLICIT NONE +SAVE +! +! Real Kind of compile-time precision for internal trans use +! ---------------------------------------------------------- +! +#ifdef PARKINDTRANS_SINGLE +INTEGER, PARAMETER :: JPRBT = SELECTED_REAL_KIND(6,37) +#else +INTEGER, PARAMETER :: JPRBT = SELECTED_REAL_KIND(13,300) +#endif + + +! +! Half precision +! -------------- + +!!INTEGER, PARAMETER :: JPRL = 2 + + + +END MODULE PARKIND_ECTRANS diff --git a/src/trans/gpu/internal/pe2set_mod.F90 b/src/trans/gpu/internal/pe2set_mod.F90 new file mode 100755 index 00000000..9a8ce8d5 --- /dev/null +++ b/src/trans/gpu/internal/pe2set_mod.F90 @@ -0,0 +1,121 @@ +! (C) Copyright 1998- ECMWF. +! (C) Copyright 1998- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE PE2SET_MOD +CONTAINS +SUBROUTINE PE2SET(KPE,KPRGPNS,KPRGPEW,KPRTRW,KPRTRV) + + +!**** *PE2SET* - Convert from PE number to set numbers + +! Purpose. +! -------- +! Convert from PE number to set numbers in both +! grid-point space and spectral space + +!** Interface. +! ---------- +! *CALL* *PE2SET(KPE,KPRGPNS,KPRGPEW,KPRTRW,KPRTRV) + +! Explicit arguments : +! -------------------- +! input: KPE - integer processor number +! in the range 1 .. NPROC +! output: KPRGPNS - integer A set number in grid space +! in the range 1 .. NPRGPNS +! KPRGPEW - integer B set number in grid space +! in the range 1 .. NPRGPEW +! KPRTRW - integer A set number in spectral space +! in the range 1 .. NPRTRW +! KPRTRV - integer B set number in spectral space +! in the range 1 .. NPRTRV + +! Implicit arguments : YOMMP parameters +! NPRGPNS,NPRGPEW,NPRTRW,NPRTRV,NPROC + +! -------------------- +! Method. +! ------- + +! PE allocation order is row oriented (e.g. NPRGPNS or NPRTRW = 4): + +! 1 2 3 4 +! 5 6 7 8 +! 9 10 11 12 +! 13 14 15 16 +! . . . . + +! Externals. +! ---------- +! NONE + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! David Dent *ECMWF* + +! Modifications. +! -------------- +! Original : 98-08-19 +! Revision : 98-10-13 row ordering +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +USE TPM_DISTR ,ONLY : LEQ_REGIONS, NPRGPEW, NPROC, NPRTRV +USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_NS +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE +INTEGER(KIND=JPIM),INTENT(IN) :: KPE +INTEGER(KIND=JPIM),INTENT(OUT) :: KPRGPNS,KPRGPEW,KPRTRW,KPRTRV + +INTEGER(KIND=JPIM) :: IPE,JA +! ------------------------------------------------------------------ + +!* 1. Check input argument for validity +! --------------------------------- + +IF(KPE <= 0.OR.KPE > NPROC) THEN + WRITE(*,'(A,2I8)') ' PE2SET INVALID ARGUMENT ',KPE,NPROC + CALL ABORT_TRANS(' PE2SET INVALID ARGUMENT ') + +ELSE + +!* 2. Compute output parameters +! ------------------------- + + IF( LEQ_REGIONS )THEN + KPRGPNS=1 + IPE=KPE + DO JA=1,N_REGIONS_NS + IF( IPE > N_REGIONS(JA) )THEN + IPE=IPE-N_REGIONS(JA) + KPRGPNS=KPRGPNS+1 + CYCLE + ENDIF + KPRGPEW=IPE + EXIT + ENDDO + ELSE + KPRGPEW=MOD(KPE-1,NPRGPEW)+1 + KPRGPNS=(KPE-1)/NPRGPEW+1 + ENDIF + KPRTRV =MOD(KPE-1,NPRTRV)+1 + KPRTRW =(KPE-1)/NPRTRV+1 + +ENDIF + +END SUBROUTINE PE2SET +END MODULE PE2SET_MOD diff --git a/src/trans/gpu/internal/pre_suleg_mod.F90 b/src/trans/gpu/internal/pre_suleg_mod.F90 new file mode 100755 index 00000000..02434109 --- /dev/null +++ b/src/trans/gpu/internal/pre_suleg_mod.F90 @@ -0,0 +1,71 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE PRE_SULEG_MOD +IMPLICIT NONE +CONTAINS +SUBROUTINE PRE_SULEG +USE PARKIND1 ,ONLY : JPRD, JPIM +USE TPM_GEN ,ONLY : NPRINTLEV,NOUT +USE TPM_DIM ,ONLY : R +USE TPM_CONSTANTS ,ONLY: RA +USE TPM_DISTR ,ONLY : D +USE TPM_FIELDS,ONLY : F + +INTEGER(KIND=JPIM) :: IM, ICOUNT,JMLOC,JN +LOGICAL :: LLP1,LLP2 + + +LLP1 = NPRINTLEV>0 +LLP2 = NPRINTLEV>1 + +ICOUNT = 0 +DO JMLOC=1,D%NUMP + IM = D%MYMS(JMLOC) + DO JN=IM,R%NTMAX+2 + ICOUNT = ICOUNT+1 + ENDDO +ENDDO + +ALLOCATE(F%REPSNM(ICOUNT)) +IF (LLP2) WRITE(NOUT,9) 'F%REPSNM ',SIZE(F%REPSNM ),SHAPE(F%REPSNM ) +ALLOCATE(F%RN(-1:R%NTMAX+3)) +IF (LLP2) WRITE(NOUT,9) 'F%RN ',SIZE(F%RN ),SHAPE(F%RN ) +ALLOCATE(F%RLAPIN(-1:R%NSMAX+2)) +IF (LLP2) WRITE(NOUT,9) 'F%RLAPIN ',SIZE(F%RLAPIN ),SHAPE(F%RLAPIN ) +ALLOCATE(F%NLTN(-1:R%NTMAX+3)) +IF (LLP2) WRITE(NOUT,9) 'F%NLTN ',SIZE(F%NLTN ),SHAPE(F%NLTN ) + +ICOUNT = 0 +DO JMLOC=1,D%NUMP + IM = D%MYMS(JMLOC) + DO JN=IM,R%NTMAX+2 + ICOUNT = ICOUNT+1 + F%REPSNM(ICOUNT) = SQRT(REAL(JN*JN-IM*IM,JPRD)/& + &REAL(4*JN*JN-1,JPRD)) + ENDDO +ENDDO + +DO JN=-1,R%NTMAX+3 + F%RN(JN) = REAL(JN,JPRD) + F%NLTN(JN) = R%NTMAX+2-JN +ENDDO +F%RLAPIN(:) = 0.0_JPRD +F%RLAPIN(0) = 0.0_JPRD +F%RLAPIN(-1) = 0.0_JPRD +DO JN=1,R%NSMAX+2 + F%RLAPIN(JN)=-(REAL(RA,JPRD)*REAL(RA,JPRD)/REAL(JN*(JN+1),JPRD)) +ENDDO + +! ------------------------------------------------------------------ +9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) + +END SUBROUTINE PRE_SULEG +END MODULE PRE_SULEG_MOD diff --git a/src/trans/gpu/internal/prepsnm_mod.F90 b/src/trans/gpu/internal/prepsnm_mod.F90 new file mode 100755 index 00000000..0f79edbf --- /dev/null +++ b/src/trans/gpu/internal/prepsnm_mod.F90 @@ -0,0 +1,105 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE PREPSNM_MOD + CONTAINS + SUBROUTINE PREPSNM + + + !**** *PREPSNM* - Prepare REPSNM for wavenumber KM + + ! Purpose. + ! -------- + ! Copy the REPSNM values for specific zonal wavenumber M + ! to work array + + !** Interface. + ! ---------- + ! CALL PREPSNM(...) + + ! Explicit arguments : KM - zonal wavenumber + ! ------------------- KMLOC - local zonal wavenumber + ! PEPSNM - REPSNM for zonal + ! wavenumber KM + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + + + ! Reference. + ! ---------- + + + ! Author. + ! ------- + ! Lars Isaksen *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 00-02-01 From LTINV in IFS CY22R1 + + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + + USE TPM_DIM ,ONLY : R + USE TPM_FIELDS ,ONLY : F, ZEPSNM + USE TPM_DISTR ,ONLY : D + USE TPM_GEN ,ONLY : NOUT + ! + + IMPLICIT NONE + + INTEGER(KIND=JPIM) :: KM,KMLOC + !!REAL(KIND=JPRB), INTENT(INOUT) :: PEPSNM(:,:) + + ! LOCAL INTEGER SCALARS + INTEGER(KIND=JPIM) :: JN + INTEGER(KIND=JPIM) :: R_NTMAX + + + ! ------------------------------------------------------------------ + + !* 1. COPY REPSNM. + ! ------------ + + + + + !!!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO + !!!$ACC parallel loop + DO KMLOC=1,D%NUMP + KM = D%MYMS(KMLOC) + + IF (KM > 0) THEN +#ifdef ACCGPU + !$ACC loop +#endif + DO JN=0,KM-1 + ZEPSNM(KMLOC,JN) = 0.0_JPRBT + ENDDO + ENDIF + + DO JN=KM,R%NTMAX+2 + ZEPSNM(KMLOC,JN) = F%REPSNM(D%NPMT(KM)+KMLOC-KM+JN) + ENDDO + ! end loop over wavenumber + ENDDO + !!!!$OMP END TARGET DATA + !!!!$ACC end data + + ! ------------------------------------------------------------------ + + END SUBROUTINE PREPSNM + + END MODULE PREPSNM_MOD diff --git a/src/trans/gpu/internal/prfi1_mod.F90 b/src/trans/gpu/internal/prfi1_mod.F90 new file mode 100755 index 00000000..ca07a7f0 --- /dev/null +++ b/src/trans/gpu/internal/prfi1_mod.F90 @@ -0,0 +1,114 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE PRFI1_MOD +CONTAINS +SUBROUTINE PRFI1(KM,KF_UV,KF_SCALARS,PIA,PSPVOR,PSPDIV,PSPSCALAR,& + & KFLDPTRUV,KFLDPTRSC) + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +!USE TPM_DISTR +!USE TPM_TRANS + +USE PRFI1B_MOD ,ONLY : PRFI1B + + +!**** *PRFI1* - Prepare spectral fields for inverse Legendre transform + +! Purpose. +! -------- +! To extract the spectral fields for a specific zonal wavenumber +! and put them in an order suitable for the inverse Legendre . +! tranforms.The ordering is from NSMAX to KM for better conditioning. +! Elements 1,2 and NLCM(KM)+1 are zeroed in preparation for computing +! u,v and derivatives in spectral space. + +!** Interface. +! ---------- +! *CALL* *PRFI1(KM,PIA,PSPVOR,PSPDIV,PSPSCALAR + +! Explicit arguments : KM - zonal wavenumber +! ------------------ PIA - spectral components for transform +! PSPVOR - vorticity +! PSPDIV - divergence +! PSPSCALAR - scalar variables + +! Implicit arguments : None. +! -------------------- + +! Method. +! ------- + +! Externals. None. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 00-02-01 From PRFI1 in IFS CY22R1 + +! ------------------------------------------------------------------ + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KM +INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV +INTEGER(KIND=JPIM),INTENT(IN) :: KF_SCALARS +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPVOR(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPDIV(:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSCALAR(:,:) +REAL(KIND=JPRB) , INTENT(OUT) :: PIA(:,:) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) + + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: IDIV, IFIRST, ILAST, IVOR + +stop 'Error: prfi1 not (yet) supported in GPU version' + +! ------------------------------------------------------------------ + +!* 1. EXTRACT FIELDS FROM SPECTRAL ARRAYS. +! ------------------------------------ + +! IFIRST = 1 +! ILAST = 4*KF_UV + +! !* 1.1 VORTICITY AND DIVERGENCE. + +! IF(KF_UV > 0)THEN +! IVOR = 1 +! IDIV = 2*KF_UV+1 +! CALL PRFI1B(KM,PIA(:,IVOR:IDIV-1),PSPVOR,KF_UV,KFLDPTRUV) +! CALL PRFI1B(KM,PIA(:,IDIV:ILAST) ,PSPDIV,KF_UV,KFLDPTRUV) +! ILAST = ILAST+4*KF_UV +! ENDIF + +! !* 1.2 SCALAR VARIABLES. + +! IF(KF_SCALARS > 0)THEN +! IFIRST = ILAST+1 +! ILAST = IFIRST - 1 + 2*KF_SCALARS +! CALL PRFI1B(KM,PIA(:,IFIRST:ILAST),PSPSCALAR(:,:),KF_SCALARS,KFLDPTRSC) +! ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE PRFI1 +END MODULE PRFI1_MOD diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 new file mode 100755 index 00000000..73098879 --- /dev/null +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -0,0 +1,196 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE PRFI1B_MOD + CONTAINS + SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) + + USE PARKIND1 ,ONLY : JPIM ,JPRB + + USE TPM_GEN ,ONLY : NOUT + USE TPM_DIM ,ONLY : R,R_NSMAX + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_NASM0 + USE IEEE_ARITHMETIC + + !**** *PRFI1* - Prepare spectral fields for inverse Legendre transform + + ! Purpose. + ! -------- + ! To extract the spectral fields for a specific zonal wavenumber + ! and put them in an order suitable for the inverse Legendre . + ! tranforms.The ordering is from NSMAX to KM for better conditioning. + ! Elements 1,2 and NLCM(KM)+1 are zeroed in preparation for computing + ! u,v and derivatives in spectral space. + + !** Interface. + ! ---------- + ! *CALL* *PRFI1B(...)* + + ! Explicit arguments : KM - zonal wavenumber + ! ------------------ PIA - spectral components for transform + ! PSPEC - spectral array + ! KFIELDS - number of fields + + + ! Implicit arguments : None. + ! -------------------- + + ! Method. + ! ------- + + ! Externals. None. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Mats Hamrud and Philippe Courtier *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 00-02-01 From PRFI1B in IFS CY22R1 + + ! ------------------------------------------------------------------ + + IMPLICIT NONE + + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS + INTEGER(KIND=JPIM) :: KM,KMLOC + REAL(KIND=JPRB) ,INTENT(IN) :: PSPEC(:,:) + REAL(KIND=JPRB) ,INTENT(INOUT) :: PIA(:,:,:) + INTEGER(KIND=JPIM),INTENT(IN) :: KDIM + INTEGER(KIND=JPIM),INTENT(IN),OPTIONAL :: KFLDPTR(:) + + ! LOCAL INTEGER SCALARS + INTEGER(KIND=JPIM) :: II, INM, IR, JN, JFLD, ILCM, IASM0, IFLD + + ! ------------------------------------------------------------------ + + !* 1. EXTRACT FIELDS FROM SPECTRAL ARRAYS. + ! -------------------------------------------------- + +#ifdef ACCGPU + !$ACC DATA & + !$ACC& PRESENT(D_NUMP,R_NSMAX,D_MYMS,D_NASM0) & + !$ACC& PRESENT(PIA) & + !$ACC& PRESENT(PSPEC) ASYNC(1) +#endif +#ifdef OMPGPU + !$OMP TARGET DATA MAP(PRESENT,ALLOC:D_NUMP,R_NSMAX,D_MYMS,D_NASM0,PSPEC) +#endif + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA IF(PRESENT(KFLDPTR)) PRESENT(KFLDPTR) ASYNC(1) +#endif + + + IF(PRESENT(KFLDPTR)) THEN + + PRINT *, "Not implemented" + STOP 4 + + !loop over wavenumber +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,ILCM,IFLD,IASM0,IR,II,INM) +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ILCM,IFLD,IASM0,IR,II,INM) & + !$ACC& FIRSTPRIVATE(KFIELDS) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JN=1,R_NSMAX+1 + DO JFLD=1,KFIELDS + KM = D_MYMS(KMLOC) + ILCM = R_NSMAX+1-KM + IFLD = KFLDPTR(JFLD) + IF (JN .LE. ILCM) THEN + IASM0 = D_NASM0(KM) + INM = IASM0+(ILCM-JN)*2 + IR = 2*(JFLD-1)+1 + II = IR+1 + PIA(IR,JN+2,KMLOC) = PSPEC(IFLD,INM ) + PIA(II,JN+2,KMLOC) = PSPEC(IFLD,INM+1) + END IF + ENDDO + ENDDO + + ! end loop over wavenumber + ENDDO + +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) PRIVATE(KM,ILCM) +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ILCM) & + !$ACC& FIRSTPRIVATE(KFIELDS) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JFLD=1,2*KFIELDS + KM = D_MYMS(KMLOC) + ILCM = R_NSMAX+1-KM + PIA(JFLD,1,KMLOC) = 0.0_JPRB + PIA(JFLD,2,KMLOC) = 0.0_JPRB + PIA(JFLD,ILCM+3,KMLOC) = 0.0_JPRB + ENDDO + ! end loop over wavenumber + ENDDO + + ELSE + + !loop over wavenumber + +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,ILCM,IOFF,INM,IR,II) +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IASM0,INM) & + !$ACC& FIRSTPRIVATE(KFIELDS,KDIM) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JN=0,R_NSMAX+3 + DO JFLD=1,KFIELDS + KM = D_MYMS(KMLOC) + + IF (JN <= 1) THEN + PIA(2*JFLD-1,JN+1,KMLOC) = 0.0_JPRB + PIA(2*JFLD ,JN+1,KMLOC) = 0.0_JPRB + ELSEIF (JN <= R_NSMAX+2-KM) THEN + IASM0 = D_NASM0(KM) + INM = IASM0+((R_NSMAX+2-JN)-KM)*2 + PIA(2*JFLD-1,JN+1,KMLOC) = PSPEC(JFLD,INM ) + PIA(2*JFLD ,JN+1,KMLOC) = PSPEC(JFLD,INM+1) + ELSEIF (JN <= R_NSMAX+3-KM) THEN + PIA(2*JFLD-1,JN+1,KMLOC) = 0.0_JPRB + PIA(2*JFLD ,JN+1,KMLOC) = 0.0_JPRB + ENDIF + ENDDO + ENDDO + ENDDO + +ENDIF + +#ifdef ACCGPU +!$ACC END DATA +!$ACC END DATA +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif + + ! ------------------------------------------------------------------ + + END SUBROUTINE PRFI1B +END MODULE PRFI1B_MOD diff --git a/src/trans/gpu/internal/read_legpol_mod.F90 b/src/trans/gpu/internal/read_legpol_mod.F90 new file mode 100755 index 00000000..7f145c6e --- /dev/null +++ b/src/trans/gpu/internal/read_legpol_mod.F90 @@ -0,0 +1,235 @@ +! (C) Copyright 2015- ECMWF. +! (C) Copyright 2015- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE READ_LEGPOL_MOD +CONTAINS +SUBROUTINE READ_LEGPOL +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT ,JPRD +USE TPM_GEN +USE TPM_DISTR +USE TPM_DIM +USE TPM_GEOMETRY +USE TPM_FLT +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE TPM_CTL +USE BYTES_IO_MOD +USE SHAREDMEM_MOD + +!**** *READ_LEGPOL * - read in Leg.Pol. and assocciated arrays from file or memory segment + +! Purpose. +! -------- +! + +!** Interface. +! ---------- +! *CALL* *READ_LEGPOL* + +! Explicit arguments : None +! -------------------- + +! Implicit arguments : +! -------------------- +! + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS +! + +! ------- +! Mats Hamrud and Willem Deconinck *ECMWF* + +! Modifications. +! -------------- +! Original : July 2015 + +IMPLICIT NONE + +INTEGER(KIND=JPIM),PARAMETER :: JPIBUFL=4 +INTEGER(KIND=JPIM) :: IRBYTES,IIBYTES,JMLOC,IPRTRV,IMLOC,IM,ILA,ILS +INTEGER(KIND=JPIM) :: IDGLU,ISIZE,IBYTES,IRET,IFILE,JSETV,IDUM,JGL,II,IDGLU2 +INTEGER(KIND=JPIM),POINTER :: IBUF(:) +REAL(KIND=JPRBT) ,ALLOCATABLE :: ZBUF(:) +INTEGER(KIND=JPIM) ,POINTER :: IBUFA(:) +CHARACTER(LEN=8) :: CLABEL +CHARACTER(LEN=16) :: CLABEL_16 + +! ------------------------------------------------------------------ + +IRBYTES = 8 +IIBYTES = 4 +IDUM = 3141 + +IF(C%CIO_TYPE == 'file') THEN + CALL BYTES_IO_OPEN(IFILE,C%CLEGPOLFNAME,'R') + ALLOCATE(IBUF(JPIBUFL)) +ELSE + NULLIFY(IBUF) +ENDIF +IF(C%CIO_TYPE == 'file') THEN + CALL BYTES_IO_READ(IFILE,IBUF,JPIBUFL*IIBYTES,IRET) +ELSE + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,JPIBUFL,IBUF,ADVANCE=.TRUE.) +ENDIF +CLABEL = TRANSFER(IBUF(1:2),CLABEL) +IF(CLABEL /= 'LEGPOL ') THEN + WRITE(NERR,*) CLABEL + CALL ABORT_TRANS('READ_LEGPOL:WRONG LABEL') +ENDIF +IF(IBUF(3) /= R%NSMAX) CALL ABORT_TRANS('READ_LEGPOL:WRONG SPECTRAL TRUNCATION') +IF(IBUF(4) /= R%NDGNH) CALL ABORT_TRANS('READ_LEGPOL:WRONG NO OF GAUSSIAN LATITUDES') +IF(C%CIO_TYPE == 'file') THEN + ALLOCATE(IBUFA(2*R%NDGNH)) + CALL BYTES_IO_READ(IFILE,IBUFA,2*R%NDGNH*IIBYTES,IRET) +ELSE + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,2*R%NDGNH,IBUFA,ADVANCE=.TRUE.) +ENDIF +II = 0 +DO JGL=1,R%NDGNH + II = II+1 + IF(IBUFA(II) /= G%NLOEN(JGL)) THEN + WRITE(NERR,*) 'WRONG NUMBER OF LONGITUDE POINTS ', JGL,G%NLOEN(JGL),IBUFA(II) + CALL ABORT_TRANS('READ_LEGPOL:WRONG NLOEN') + ENDIF + II=II+1 + IF(IBUFA(II) /= G%NMEN(JGL)) THEN + WRITE(NERR,*) 'WRONG CUT-OFF WAVE NUMBER ', JGL,G%NMEN(JGL),IBUFA(II) + CALL ABORT_TRANS('READ_LEGPOL:WRONG NMEN') + ENDIF +ENDDO +IF(C%CIO_TYPE == 'file') THEN + DEALLOCATE(IBUFA) +ENDIF + +DO JMLOC=1,D%NUMP,NPRTRV ! +++++++++++++++++++++ JMLOC LOOP ++++++++++ + IPRTRV=MIN(NPRTRV,D%NUMP-JMLOC+1) + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ILA = (R%NSMAX-IM+2)/2 + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) +! Anti-symmetric + IF(C%CIO_TYPE == 'file') THEN + ISIZE = IDGLU*ILA + ALLOCATE(ZBUF(ISIZE)) + IBYTES = ISIZE*IRBYTES + CALL BYTES_IO_READ(IFILE,ZBUF,IBYTES,IRET) + IF(IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(NERR,*) 'BYTES_IO_READ ',IFILE,' ',IBYTES,' FAILED',IRET + CALL ABORT_TRANS('READ_LEGPOL:BYTES_IO_READ FAILED') + ENDIF + ALLOCATE(S%FA(IMLOC)%RPNMA(IDGLU,ILA)) + S%FA(IMLOC)%RPNMA(:,:) = RESHAPE(ZBUF,(/IDGLU,ILA/)) + DEALLOCATE(ZBUF) + ELSE + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,IDGLU,ILA,S%FA(IMLOC)%RPNMA,ADVANCE=.TRUE.) + ENDIF +! Symmetric + IF(C%CIO_TYPE == 'file') THEN + ISIZE = IDGLU*ILS + IBYTES = ISIZE*IRBYTES + ALLOCATE(ZBUF(ISIZE)) + CALL BYTES_IO_READ(IFILE,ZBUF,IBYTES,IRET) + IF(IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(NERR,*) 'BYTES_IO_READ ',IFILE,' ',IBYTES,' FAILED',IRET + CALL ABORT_TRANS('READ_LEGPOL:BYTES_IO_READ FAILED') + ENDIF + ALLOCATE(S%FA(IMLOC)%RPNMS(IDGLU,ILS)) + S%FA(IMLOC)%RPNMS(:,:) = RESHAPE(ZBUF,(/IDGLU,ILS/)) + DEALLOCATE(ZBUF) + ELSE + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,IDGLU,ILS,S%FA(IMLOC)%RPNMS,ADVANCE=.TRUE.) + ENDIF + ENDDO +ENDDO + +! Lat-lon grid +IF(S%LDLL) THEN + IF(C%CIO_TYPE == 'file') THEN + CALL BYTES_IO_READ(IFILE,IBUF,JPIBUFL*IIBYTES,IRET) + ELSE + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,JPIBUFL,IBUF,ADVANCE=.TRUE.) + ENDIF + CLABEL_16 = TRANSFER(IBUF,CLABEL_16) + IF(CLABEL_16 /= 'LATLON---BEG-BEG')CALL ABORT_TRANS('READ_LEGPOL:WRONG LAT/LON LABEL') + + DO JMLOC=1,D%NUMP + IM = D%MYMS(JMLOC) + ILA = (R%NSMAX-IM+2)/2 + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + IDGLU2 = S%NDGNHD + + IF(C%CIO_TYPE == 'file') THEN + CALL BYTES_IO_READ(IFILE,IBUF,JPIBUFL*IIBYTES,IRET) + ELSE + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,JPIBUFL,IBUF,ADVANCE=.TRUE.) + ENDIF + IF(IBUF(1) /= IM .OR. IBUF(2) /= IDGLU .OR. IBUF(3) /= IDGLU2 ) THEN + WRITE(NERR,*) 'READ_LEGPOL ERROR ', IBUF,IM,IDGLU,IDGLU2 + CALL ABORT_TRANS('READ_LEGPOL:WRONG LAT-LON MATRIX SIZE') + ENDIF + + IF(C%CIO_TYPE == 'file') THEN + + ISIZE = 2*IDGLU*2 + IBYTES = ISIZE*IRBYTES + ALLOCATE(ZBUF(ISIZE)) + CALL BYTES_IO_READ(IFILE,ZBUF,IBYTES,IRET) + IF(IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(NERR,*) 'BYTES_IO_READ ',IFILE,' ',IBYTES,' FAILED',IRET + CALL ABORT_TRANS('READ_LEGPOL:BYTES_IO_READ FAILED') + ENDIF + ALLOCATE(S%FA(JMLOC)%RPNMWI(2*IDGLU,2)) + S%FA(JMLOC)%RPNMWI(:,:) = RESHAPE(ZBUF,(/2*IDGLU,2/)) + DEALLOCATE(ZBUF) + + ISIZE = 2*IDGLU2*2 + IBYTES = ISIZE*IRBYTES + ALLOCATE(ZBUF(ISIZE)) + CALL BYTES_IO_READ(IFILE,ZBUF,IBYTES,IRET) + IF(IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(NERR,*) 'BYTES_IO_READ ',IFILE,' ',IBYTES,' FAILED',IRET + CALL ABORT_TRANS('READ_LEGPOL:BYTES_IO_READ FAILED') + ENDIF + ALLOCATE(S%FA(JMLOC)%RPNMWO(2*IDGLU2,2)) + S%FA(JMLOC)%RPNMWO(:,:) = RESHAPE(ZBUF,(/2*IDGLU2,2/)) + DEALLOCATE(ZBUF) + + ELSE + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,2*IDGLU,2,S%FA(JMLOC)%RPNMWI,ADVANCE=.TRUE.) + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,2*IDGLU2,2,S%FA(JMLOC)%RPNMWO,ADVANCE=.TRUE.) + ENDIF + ENDDO +ENDIF + +IF(C%CIO_TYPE == 'file') THEN + CALL BYTES_IO_READ(IFILE,IBUF,JPIBUFL*IIBYTES,IRET) +ELSE + CALL SHAREDMEM_ASSOCIATE(C%STORAGE,JPIBUFL,IBUF,ADVANCE=.TRUE.) +ENDIF +CLABEL_16 = TRANSFER(IBUF,CLABEL_16) +IF(CLABEL_16 /= 'LEGPOL---EOF-EOF')CALL ABORT_TRANS('READ_LEGPOL:WRONG END LABEL') +IF(C%CIO_TYPE == 'file') THEN + CALL BYTES_IO_CLOSE(IFILE) + DEALLOCATE(IBUF) +ENDIF + +END SUBROUTINE READ_LEGPOL +END MODULE READ_LEGPOL_MOD diff --git a/src/trans/gpu/internal/set2pe_mod.F90 b/src/trans/gpu/internal/set2pe_mod.F90 new file mode 100755 index 00000000..c7f69d31 --- /dev/null +++ b/src/trans/gpu/internal/set2pe_mod.F90 @@ -0,0 +1,131 @@ +! (C) Copyright 1998- ECMWF. +! (C) Copyright 1998- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SET2PE_MOD +CONTAINS +SUBROUTINE SET2PE(KPE,KPRGPNS,KPRGPEW,KPRTRW,KPRTRV) + + +!**** *SET2PE* - Convert from set numbers to PE number + +! Purpose. +! -------- +! Convert from set numbers in either grid-point space or spectral space +! to PE number + +!** Interface. +! ---------- +! *CALL* *SET2PE(KPRGPNS,KPRGPEW,KPRTRW,KPRTRV,KPE) + +! Explicit arguments : +! -------------------- + +! input : KPRGPNS - integer A set number in grid space +! in the range 1 .. NPRGPNS +! KPRGPEW - integer B set number in grid space +! in the range 1 .. NPRGPEW +! KPRTRW - integer A set number in spectral space +! in the range 1 .. NPRTRW +! KPRTRV - integer B set number in spectral space +! in the range 1 .. NPRTRV +! output: KPE - integer processor number +! in the range 1 .. NPROC + +! Normally, one pair of input set numbers will be set to zero +! SET2PE will compute KPE from the first pair if they are valid numbers. +! else from the other pair, + +! Implicit arguments : YOMMP parameters +! NPRGPNS,NPRGPEW,NPRTRW,NPRTRV,NPROC + +! -------------------- +! Method. +! ------- + +! Externals. +! ---------- +! NONE + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! David Dent *ECMWF* + +! Modifications. +! -------------- +! Original : 98-08-19 +! ------------------------------------------------------------------ + + +USE PARKIND1 ,ONLY : JPIM + +USE TPM_DISTR ,ONLY : LEQ_REGIONS, NPRGPEW, NPRGPNS, NPRTRV, NPRTRW +USE EQ_REGIONS_MOD ,ONLY : N_REGIONS, N_REGIONS_NS +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE +INTEGER(KIND=JPIM),INTENT(IN) :: KPRGPNS,KPRGPEW,KPRTRW,KPRTRV +INTEGER(KIND=JPIM),INTENT(OUT) :: KPE + +INTEGER(KIND=JPIM) :: IPE,JA +! ------------------------------------------------------------------ + +!* 1. Choose from input parameters +! ---------------------------- + +IF(KPRGPNS > 0.AND.KPRGPEW > 0) THEN + + IF( LEQ_REGIONS )THEN + IF( KPRGPNS > N_REGIONS_NS )THEN + WRITE(*,'(A,2I8)') ' SET2PE INVALID ARGUMENT ',KPRGPNS,N_REGIONS_NS + CALL ABOR1(' SET2PE INVALID ARGUMENT ') + ENDIF + IF( KPRGPEW > N_REGIONS(KPRGPNS) )THEN + WRITE(*,'(A,2I8)') ' SET2PE INVALID ARGUMENT ',KPRGPEW,N_REGIONS(KPRGPNS) + CALL ABOR1(' SET2PE INVALID ARGUMENT ') + ENDIF + KPE=0 + DO JA=1,KPRGPNS-1 + KPE=KPE+N_REGIONS(JA) + ENDDO + KPE=KPE+KPRGPEW + ELSE + IF(KPRGPNS <= NPRGPNS.AND.KPRGPEW <= NPRGPEW) THEN + +!* 2. Grid-space set values supplied +! ------------------------------ + + KPE=(KPRGPNS-1)*NPRGPEW + KPRGPEW + ELSE + WRITE(*,'(A,2I8)') ' SET2PE INVALID ARGUMENT ',KPRGPNS,KPRGPEW + CALL ABORT_TRANS(' SET2PE INVALID ARGUMENT ') + ENDIF + ENDIF + +ELSE + +!* 3. Spectral space set values supplied +! ---------------------------------- + + IF(KPRTRW <= NPRTRW.AND.KPRTRV <= NPRTRV) THEN + KPE=(KPRTRW-1)*NPRTRV + KPRTRV + ELSE + WRITE(*,'(A,2I8)') ' SET2PE INVALID ARGUMENT ',KPRTRW,KPRTRV + CALL ABORT_TRANS(' SET2PE INVALID ARGUMENT ') + ENDIF + +ENDIF + +END SUBROUTINE SET2PE +END MODULE SET2PE_MOD diff --git a/src/trans/gpu/internal/set_resol_mod.F90 b/src/trans/gpu/internal/set_resol_mod.F90 new file mode 100755 index 00000000..9fb18ac6 --- /dev/null +++ b/src/trans/gpu/internal/set_resol_mod.F90 @@ -0,0 +1,73 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SET_RESOL_MOD +CONTAINS +SUBROUTINE SET_RESOL(KRESOL,LDSETUP) +USE PARKIND1 ,ONLY : JPIM + +USE TPM_GEN ,ONLY : NOUT, MSETUP0, NCUR_RESOL, NMAX_RESOL,LENABLED +USE TPM_DIM ,ONLY : R, DIM_RESOL +!USE TPM_TRANS +USE TPM_DISTR ,ONLY : D, DISTR_RESOL +USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL +USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL +USE TPM_FFT ,ONLY : T, FFT_RESOL +USE TPM_HICFFT ,ONLY : HICT, HICFFT_RESOL +USE TPM_FLT +USE TPM_CTL ,ONLY : C, CTL_RESOL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE + +! Declaration of arguments + +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL +LOGICAL ,OPTIONAL, INTENT(IN) :: LDSETUP + +! Local varaibles +INTEGER(KIND=JPIM) :: IRESOL +LOGICAL :: LLSETUP + +! ------------------------------------------------------------------ + +IF(MSETUP0 == 0) CALL ABORT_TRANS('SET_RESOL:TRANS NOT SETUP') +LLSETUP = .FALSE. +IF(PRESENT(LDSETUP)) LLSETUP = LDSETUP +IRESOL = 1 +IF(PRESENT(KRESOL)) THEN + IRESOL = KRESOL + IF(IRESOL < 1 .OR. IRESOL > NMAX_RESOL) THEN + WRITE(NOUT,*)'SET_RESOL: UNKNOWN RESOLUTION ',IRESOL,NMAX_RESOL + CALL ABORT_TRANS('SET_RESOL:IRESOL < 1 .OR. KRESOL > NMAX_RESOL') + ENDIF + IF(.NOT.LLSETUP) THEN + IF(.NOT.LENABLED(IRESOL)) THEN + WRITE(NOUT,*)'SET_RESOL: UNKNOWN RESOLUTION ',IRESOL,LENABLED + CALL ABORT_TRANS('SET_RESOL:IRESOL NOT ENABLED') + ENDIF + ENDIF +ENDIF +IF(IRESOL /= NCUR_RESOL) THEN + NCUR_RESOL = IRESOL + R => DIM_RESOL(NCUR_RESOL) + F => FIELDS_RESOL(NCUR_RESOL) + G => GEOM_RESOL(NCUR_RESOL) + D => DISTR_RESOL(NCUR_RESOL) + T => FFT_RESOL(NCUR_RESOL) + HICT => HICFFT_RESOL(NCUR_RESOL) + S => FLT_RESOL(NCUR_RESOL) + C => CTL_RESOL(NCUR_RESOL) +ENDIF + +END SUBROUTINE SET_RESOL +END MODULE SET_RESOL_MOD diff --git a/src/trans/gpu/internal/setup_dims_mod.F90 b/src/trans/gpu/internal/setup_dims_mod.F90 new file mode 100755 index 00000000..c0277d3d --- /dev/null +++ b/src/trans/gpu/internal/setup_dims_mod.F90 @@ -0,0 +1,50 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SETUP_DIMS_MOD +CONTAINS +SUBROUTINE SETUP_DIMS + +USE PARKIND1 ,ONLY : JPIM + +USE TPM_DIM ,ONLY : R +USE TPM_FLT ,ONLY : S +! + +IMPLICIT NONE + +INTEGER(KIND=JPIM) :: JM,JN,ISPOLEG + +! ------------------------------------------------------------------ + +ISPOLEG = 0 +DO JM=0,R%NSMAX + DO JN=JM,R%NTMAX+1 + ISPOLEG = ISPOLEG+1 + ENDDO +ENDDO +R%NSPOLEG = ISPOLEG + +R%NSPEC_G = (R%NSMAX+1)*(R%NSMAX+2)/2 +R%NSPEC2_G = R%NSPEC_G*2 + +R%NDGNH = (R%NDGL+1)/2 + +R%NLEI1 = R%NSMAX+4+MOD(R%NSMAX+4+1,2) +R%NLEI3 = R%NDGNH+MOD(R%NDGNH+2,2) +IF (S%LSOUTHPNM) R%NLEI3=2*R%NLEI3 + +R%NLED3 = R%NTMAX+2+MOD(R%NTMAX+3,2) +R%NLED4 = R%NTMAX+3+MOD(R%NTMAX+4,2) + +! ------------------------------------------------------------------ + +END SUBROUTINE SETUP_DIMS +END MODULE SETUP_DIMS_MOD diff --git a/src/trans/gpu/internal/setup_geom_mod.F90 b/src/trans/gpu/internal/setup_geom_mod.F90 new file mode 100755 index 00000000..9c688302 --- /dev/null +++ b/src/trans/gpu/internal/setup_geom_mod.F90 @@ -0,0 +1,110 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SETUP_GEOM_MOD +CONTAINS +SUBROUTINE SETUP_GEOM + +USE PARKIND1 ,ONLY : JPRD, JPIM + +USE TPM_GEN ,ONLY : NOUT, NPRINTLEV +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D +USE TPM_FIELDS ,ONLY : F +USE TPM_GEOMETRY ,ONLY : G +! + +IMPLICIT NONE + +REAL(KIND=JPRD) :: ZSQM2(R%NDGL) +INTEGER(KIND=JPIM) :: IDGLU(0:R%NSMAX,R%NDGNH) +INTEGER(KIND=JPIM) :: JGL,JM,NSMAXLIN + +LOGICAL :: LLP1,LLP2 + +! ------------------------------------------------------------------ + +IF(.NOT.D%LGRIDONLY) THEN + + LLP1 = NPRINTLEV>0 + LLP2 = NPRINTLEV>1 + + IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SETUP_GEOM ===' + + ALLOCATE (G%NMEN(R%NDGL)) + IF(LLP2)WRITE(NOUT,9) 'G%NMEN ',SIZE(G%NMEN ),SHAPE(G%NMEN ) + + NSMAXLIN = R%NDGL-1 + IF (R%NSMAX>=NSMAXLIN .OR. .NOT. G%LREDUCED_GRID) THEN + ! linear or full grid + DO JGL=1,R%NDGL + G%NMEN(JGL) = MIN(R%NSMAX,(G%NLOEN(JGL)-1)/2) + ENDDO + ELSEIF (R%NSMAX>=R%NDGL*2/3-1) THEN + ! quadratic grid + ZSQM2(:) = 3*(NSMAXLIN-R%NSMAX)/R%NDGL*F%R1MU2(:) + G%NMEN(1) = MIN(R%NSMAX,INT(REAL(G%NLOEN(1)-1,JPRD)/(2.0_JPRD+ZSQM2(1)))) + DO JGL=2,R%NDGNH + G%NMEN(JGL) = MIN(R%NSMAX,MAX(G%NMEN(JGL-1),& + &INT(REAL(G%NLOEN(JGL)-1,JPRD)/(2.0_JPRD+ ZSQM2(JGL))))) + ENDDO + ! * SOUTHERN HEMISPHERE : + G%NMEN(R%NDGL) = MIN(R%NSMAX,INT(REAL(G%NLOEN(R%NDGL)-1,JPRD)/(2.0_JPRD+ZSQM2(R%NDGL)))) + DO JGL=R%NDGL-1, R%NDGNH+1, -1 + G%NMEN(JGL) = MIN(R%NSMAX,MAX(G%NMEN(JGL+1),& + &INT(REAL(G%NLOEN(JGL)-1,JPRD)/(2.0_JPRD+ ZSQM2(JGL))))) + ENDDO + ELSE + ! cubic grid + ZSQM2(:) = F%R1MU2(:) + G%NMEN(1) = MIN(R%NSMAX,INT(REAL(G%NLOEN(1)-1,JPRD)/(2.0_JPRD+ZSQM2(1)))-1) + DO JGL=2,R%NDGNH + G%NMEN(JGL) = MIN(R%NSMAX,MAX(G%NMEN(JGL-1),& + &INT(REAL(G%NLOEN(JGL)-1,JPRD)/(2.0_JPRD+ ZSQM2(JGL)))-1)) + ENDDO + ! * SOUTHERN HEMISPHERE : + G%NMEN(R%NDGL) = MIN(R%NSMAX,INT(REAL(G%NLOEN(R%NDGL)-1,JPRD)/(2.0_JPRD+ZSQM2(R%NDGL)))-1) + DO JGL=R%NDGL-1, R%NDGNH+1, -1 + G%NMEN(JGL) = MIN(R%NSMAX,MAX(G%NMEN(JGL+1),& + &INT(REAL(G%NLOEN(JGL)-1,JPRD)/(2.0_JPRD+ ZSQM2(JGL)))-1)) + ENDDO + ENDIF + IF(LLP1) THEN + WRITE(NOUT,FMT='('' (JGL,G%NLOEN,G%NMEN) '')') + WRITE(NOUT,FMT='(8(1X,''('',I4,I4,I4,'')''))')& + &(JGL,G%NLOEN(JGL),G%NMEN(JGL),JGL=1,R%NDGL) + ENDIF + ALLOCATE(G%NDGLU(0:R%NSMAX)) + IF(LLP2)WRITE(NOUT,9) 'G%NDGLU ',SIZE(G%NDGLU ),SHAPE(G%NDGLU ) + IDGLU(:,:) = 0 + G%NDGLU(:) = 0 + DO JGL=1,R%NDGNH + DO JM=0,G%NMEN(JGL) + IDGLU(JM,JGL) = 1 + ENDDO + ENDDO + DO JM=0,R%NSMAX + DO JGL=1,R%NDGNH + G%NDGLU(JM) = G%NDGLU(JM)+IDGLU(JM,JGL) + ENDDO + ENDDO + IF(LLP1) THEN + WRITE(NOUT,FMT='('' (JM,G%NDGLU) '')') + WRITE(NOUT,FMT='(10(1X,''('',I4,I4,'')''))')& + &(JM,G%NDGLU(JM),JM=0,R%NSMAX) + ENDIF + +ENDIF + +! ------------------------------------------------------------------ +9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) + +END SUBROUTINE SETUP_GEOM +END MODULE SETUP_GEOM_MOD diff --git a/src/trans/gpu/internal/shuffle_mod.F90 b/src/trans/gpu/internal/shuffle_mod.F90 new file mode 100755 index 00000000..5cfd1738 --- /dev/null +++ b/src/trans/gpu/internal/shuffle_mod.F90 @@ -0,0 +1,137 @@ +! (C) Copyright 2001- ECMWF. +! (C) Copyright 2001- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SHUFFLE_MOD +CONTAINS +SUBROUTINE SHUFFLE(KF_UV_G,KF_SCALARS_G,KSHFUV_G,KIVSETUV,KSHFSC_G,KIVSETSC,& + & KVSETUV,KVSETSC) + +!**** *SHUFFLE* - Re-shuffle fields for load balancing + +! Purpose. +! -------- +! Re-shuffle fields for load balancing if NPRTRV>1. Note that the +! relative order of the local spectral fields has to maintained. + +!** Interface. +! ---------- +! CALL SHUFFLE(...) + +! Explicit arguments : +! -------------------- +! KF_UV_G - global number of spectral u-v fields +! KF_SCALARS_G - global number of scalar spectral fields +! KSHFUV_G - reshuffling index for uv fields +! KIVSETUV - reshuffled KVSETUV +! KSHFSC_G - reshuffling index for scalar fields +! KIVSETSC - reshuffled KVSETSC +! KVSETUV(:) - indicating which 'b-set' in spectral space owns a +! vor/div field. Equivalant to NBSETLEV in the IFS. +! The length of KVSETUV should be the GLOBAL number +! of u/v fields which is the dimension of u and v releated +! fields in grid-point space. +! KVESETSC(:) - indicating which 'b-set' in spectral space owns a +! scalar field. As for KVSETUV this argument is required +! if the total number of processors is greater than +! the number of processors used for distribution in +! spectral wave space. + +! Externals. NONE +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 01-01-03 + +! ------------------------------------------------------------------ +USE PARKIND1 ,ONLY : JPIM + +!USE TPM_GEN +!USE TPM_TRANS +USE TPM_DISTR ,ONLY : NPRTRV +! + +IMPLICIT NONE + +! Declaration of arguments + +INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G,KF_SCALARS_G +INTEGER(KIND=JPIM), INTENT(OUT) :: KSHFUV_G(:),KSHFSC_G(:) +INTEGER(KIND=JPIM), INTENT(OUT) :: KIVSETUV(:),KIVSETSC(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) + +INTEGER(KIND=JPIM) :: IHELP(MAX(KF_UV_G,KF_SCALARS_G),NPRTRV),IHELPC(NPRTRV) +INTEGER(KIND=JPIM) :: IDW,J + +! ------------------------------------------------------------------ + +IF(NPRTRV > 1) THEN + IHELP(:,:) = 0 + IHELPC(:) = 0 + DO J=1,KF_UV_G + IHELPC(KVSETUV(J)) = IHELPC(KVSETUV(J))+1 + IHELP(IHELPC(KVSETUV(J)),KVSETUV(J)) = J + ENDDO + IDW = KF_UV_G+1 + DO + DO J=NPRTRV,1,-1 + IF(IHELPC(J) > 0) THEN + IDW = IDW-1 + KSHFUV_G(IDW) = IHELP(IHELPC(J),J) + IHELPC(J) =IHELPC(J)-1 + ENDIF + ENDDO + IF(IDW == 1) EXIT + ENDDO + + IHELP(:,:) = 0 + IHELPC(:) = 0 + DO J=1,KF_SCALARS_G + IHELPC(KVSETSC(J)) = IHELPC(KVSETSC(J))+1 + IHELP(IHELPC(KVSETSC(J)),KVSETSC(J)) = J + ENDDO + IDW = KF_SCALARS_G+1 + DO + DO J=NPRTRV,1,-1 + IF(IHELPC(J) > 0) THEN + IDW = IDW-1 + KSHFSC_G(IDW) = IHELP(IHELPC(J),J) + IHELPC(J) =IHELPC(J)-1 + ENDIF + ENDDO + IF(IDW == 1) EXIT + ENDDO + + DO J=1,KF_UV_G + KIVSETUV(J) = KVSETUV(KSHFUV_G(J)) + ENDDO + DO J=1,KF_SCALARS_G + KIVSETSC(J) = KVSETSC(KSHFSC_G(J)) + ENDDO +ELSE + DO J=1,KF_UV_G + KSHFUV_G(J) = J + KIVSETUV(J) = 1 + ENDDO + DO J=1,KF_SCALARS_G + KSHFSC_G(J) = J + KIVSETSC(J) = 1 + ENDDO +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE SHUFFLE +END MODULE SHUFFLE_MOD diff --git a/src/trans/gpu/internal/spnorm_ctl_mod.F90 b/src/trans/gpu/internal/spnorm_ctl_mod.F90 new file mode 100755 index 00000000..6d102559 --- /dev/null +++ b/src/trans/gpu/internal/spnorm_ctl_mod.F90 @@ -0,0 +1,62 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SPNORM_CTL_MOD +CONTAINS +SUBROUTINE SPNORM_CTL(PNORM,PSPEC,KFLD,KFLD_G,KVSET,KMASTER,PMET) + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D, MYPROC, MYSETV + +USE SPNORMD_MOD ,ONLY : SPNORMD +USE SPNORMC_MOD ,ONLY : SPNORMC +! + +IMPLICIT NONE + +REAL(KIND=JPRB) , INTENT(OUT) :: PNORM(:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPEC(:,:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSET(:) +INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KMASTER +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PMET(:) +INTEGER(KIND=JPIM) , INTENT(IN) :: KFLD,KFLD_G +INTEGER(KIND=JPIM) :: IVSET(KFLD_G) +REAL(KIND=JPRBT) :: ZMET(0:R%NSMAX) +REAL(KIND=JPRBT) :: ZSM(KFLD,D%NUMP) +REAL(KIND=JPRBT) :: ZGM(KFLD_G,0:R%NSMAX) + +! ------------------------------------------------------------------ + +IF(PRESENT(KVSET)) THEN + IVSET(:) = KVSET(:) +ELSE + IVSET(:) = MYSETV +ENDIF + +IF(PRESENT(PMET)) THEN + ZMET(:) = PMET(:) +ELSE + ZMET(:) = 1.0_JPRBT +ENDIF + +CALL SPNORMD(PSPEC,KFLD,ZMET,ZSM) + +CALL SPNORMC(ZSM,KFLD_G,IVSET,KMASTER,R%NSMAX,ZGM) + +IF(MYPROC == KMASTER) THEN + PNORM(1:KFLD_G) = REAL(SUM(ZGM,DIM=2), KIND=JPRB) + PNORM(1:KFLD_G) = SQRT(PNORM(1:KFLD_G)) +ENDIF +! ------------------------------------------------------------------ + +END SUBROUTINE SPNORM_CTL +END MODULE SPNORM_CTL_MOD diff --git a/src/trans/gpu/internal/spnormc_mod.F90 b/src/trans/gpu/internal/spnormc_mod.F90 new file mode 100755 index 00000000..337685cd --- /dev/null +++ b/src/trans/gpu/internal/spnormc_mod.F90 @@ -0,0 +1,89 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SPNORMC_MOD +CONTAINS +SUBROUTINE SPNORMC(PSM,KFLD_G,KVSET,KMASTER,KSMAX,PGM) + + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT +USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_BARRIER + +USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRV, MYPROC, NPROC + +USE PE2SET_MOD ,ONLY : PE2SET + +IMPLICIT NONE + +REAL(KIND=JPRBT) ,INTENT(IN) :: PSM(:,:) +INTEGER(KIND=JPIM) ,INTENT(IN) :: KFLD_G +INTEGER(KIND=JPIM) ,INTENT(IN) :: KVSET(:) +INTEGER(KIND=JPIM) ,INTENT(IN) :: KMASTER +INTEGER(KIND=JPIM) ,INTENT(IN) :: KSMAX +REAL(KIND=JPRBT) ,INTENT(OUT) :: PGM(KFLD_G,0:KSMAX) + +REAL(KIND=JPRBT) :: ZRECVBUF(SIZE(PGM)) +INTEGER(KIND=JPIM) :: IFLDR(NPRTRV) + +INTEGER(KIND=JPIM) :: ISTOTAL,JFLD,ITAG,JROC,IMSGLEN,IRECVID +INTEGER(KIND=JPIM) :: IRECVNUMP,IRECVFLD,IFLD,JMLOC,IM,IBUFLENR,IA,IB +INTEGER(KIND=JPIM) :: IRECVSETA,IRECVSETB +! ------------------------------------------------------------------ + +ISTOTAL = SIZE(PSM) +IBUFLENR = SIZE(ZRECVBUF) + +IFLDR(:) = 0 +DO JFLD=1,KFLD_G + IFLDR(KVSET(JFLD)) = IFLDR(KVSET(JFLD))+1 +ENDDO +ITAG = 100 + +IF (NPROC > 1.AND.MYPROC /= KMASTER) THEN + CALL MPL_SEND(PSM(:,:),KDEST=NPRCIDS(KMASTER),KTAG=ITAG,& + &CDSTRING='SPNORMC:') +ENDIF + +IF (MYPROC == KMASTER) THEN + DO JROC=1,NPROC + IF (JROC == KMASTER) THEN + ZRECVBUF(1:ISTOTAL) = RESHAPE(PSM,SHAPE(ZRECVBUF(1:ISTOTAL))) + IRECVID = MYPROC + IMSGLEN = ISTOTAL + ELSE + CALL MPL_RECV(ZRECVBUF(1:IBUFLENR),KTAG=ITAG,& + &KFROM=IRECVID,CDSTRING='SPNORMC :') + ENDIF + CALL PE2SET(IRECVID,IA,IB,IRECVSETA,IRECVSETB) + IRECVNUMP = D%NUMPP(IRECVSETA) + IRECVFLD = IFLDR(IRECVSETB) + IFLD = 0 + DO JFLD=1,KFLD_G + IF(KVSET(JFLD) == IRECVSETB) THEN + IFLD=IFLD+1 + DO JMLOC=1,IRECVNUMP + IM = D%NALLMS(D%NPTRMS(IRECVSETA)-1+JMLOC) + PGM(JFLD,IM) = ZRECVBUF((JMLOC-1)*IRECVFLD+IFLD) + ENDDO + ENDIF + ENDDO + ENDDO +ENDIF + +! Perform barrier synchronisation to guarantee all processors have +! completed communication + +IF( NPROC > 1 )THEN + CALL MPL_BARRIER(CDSTRING='SPNORMC') +ENDIF +! ------------------------------------------------------------------ + +END SUBROUTINE SPNORMC +END MODULE SPNORMC_MOD diff --git a/src/trans/gpu/internal/spnormd_mod.F90 b/src/trans/gpu/internal/spnormd_mod.F90 new file mode 100755 index 00000000..77aafd3f --- /dev/null +++ b/src/trans/gpu/internal/spnormd_mod.F90 @@ -0,0 +1,66 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SPNORMD_MOD +CONTAINS +SUBROUTINE SPNORMD(PSPEC,KFLD,PMET,PSM) + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D +! + +IMPLICIT NONE + +REAL(KIND=JPRB) ,INTENT(IN) :: PSPEC(:,:) +REAL(KIND=JPRBT) ,INTENT(IN) :: PMET(0:R%NSMAX) +INTEGER(KIND=JPIM) ,INTENT(IN) :: KFLD +REAL(KIND=JPRBT) ,INTENT(OUT) :: PSM(:,:) + +INTEGER(KIND=JPIM) :: JM ,JFLD ,JN ,IM ,ISP + +! ------------------------------------------------------------------ + + +CALL GSTATS(1651,0) +!$OMP PARALLEL DO SCHEDULE(STATIC,1) PRIVATE(JM,IM,JN,ISP,JFLD) +DO JM=1,D%NUMP + PSM(:,JM) = 0.0_JPRBT + IM = D%MYMS(JM) + IF(IM == 0)THEN + DO JN=0,R%NSMAX + ISP = D%NASM0(0)+JN*2 + DO JFLD=1,KFLD + PSM(JFLD,JM) = PSM(JFLD,JM)+PMET(JN)*PSPEC(JFLD,ISP)**2 + ENDDO + ENDDO + ELSE + DO JN=IM,R%NSMAX + ISP = D%NASM0(IM)+(JN-IM)*2 + DO JFLD=1,KFLD + PSM(JFLD,JM) = PSM(JFLD,JM)+2.0_JPRBT*PMET(JN)*& + &(PSPEC(JFLD,ISP)**2+PSPEC(JFLD,ISP+1)**2) + ENDDO + ENDDO + ENDIF +ENDDO +!$OMP END PARALLEL DO +CALL GSTATS(1651,1) + +! ------------------------------------------------------------------ + +END SUBROUTINE SPNORMD +END MODULE SPNORMD_MOD + + + + + diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 new file mode 100755 index 00000000..5354728a --- /dev/null +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -0,0 +1,151 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SPNSDE_MOD +CONTAINS +SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + +USE TPM_GEN ,ONLY : NOUT +USE TPM_DIM ,ONLY : R, R_NTMAX +USE TPM_DISTR ,ONLY : D, D_MYMS, D_NUMP +USE TPM_FIELDS ,ONLY : ZEPSNM +!USE TPM_TRANS + + +!**** *SPNSDE* - Compute North-South derivative in spectral space + +! Purpose. +! -------- +! In Laplace space compute the the North-south derivative + +!** Interface. +! ---------- +! CALL SPNSDE(...) + +! Explicit arguments : +! -------------------- +! KM -zonal wavenumber (input-c) +! PEPSNM - REPSNM for wavenumber KM (input-c) +! PF (NLEI1,2*KF_SCALARS) - input field (input) +! PNSD(NLEI1,2*KF_SCALARS) - N-S derivative (output) + +! Organisation within NLEI1: +! NLEI1 = NSMAX+4+mod(NSMAX+4+1,2) +! overdimensioning +! 1 : n=NSMAX+2 +! 2 : n=NSMAX+1 +! 3 : n=NSMAX +! . : +! . : +! NSMAX+3 : n=0 +! NSMAX+4 : n=-1 + +! Implicit arguments : YOMLAP +! -------------------- + +! Method. +! ------- + +! Externals. None. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS +! Temperton, 1991, MWR 119 p1303 + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 00-02-01 From SPNSDE in IFS CY22R1 + +! ------------------------------------------------------------------ + +IMPLICIT NONE + +INTEGER(KIND=JPIM) :: KM, KMLOC +INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS +!REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(0:R%NTMAX+2) +REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(1:D%NUMP,0:R%NTMAX+2) +REAL(KIND=JPRB), INTENT(IN) :: PF(:,:,:) +REAL(KIND=JPRB), INTENT(OUT) :: PNSD(:,:,:) + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: IJ, ISKIP, J, JN, JI, IR, II + +#ifdef ACCGPU +!$ACC DATA & +!$ACC& PRESENT (R_NTMAX, D_MYMS) & +!$ACC& PRESENT (D_NUMP,PEPSNM, PF, PNSD) ASYNC(1) +#endif +#ifdef OMPGPU +!$OMP TARGET DATA & +!$OMP& MAP(PRESENT,ALLOC:ZN) +#endif + +! ------------------------------------------------------------------ + +!* 1. COMPUTE NORTH SOUTH DERIVATIVE. +! ------------------------------- + + +!* 1.1 COMPUTE + +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO + !! DEFAULT(NONE) PRIVATE(IJ) & + !!$OMP& SHARED(KM,F,ZN,ZEPSNM,KMLOC) +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IR,II,JI) & + !$ACC& FIRSTPRIVATE(KMLOC,KF_SCALARS) ASYNC(1) +#endif +DO KMLOC=1,D_NUMP + DO JN=0,R_NTMAX+1 + DO J=1,KF_SCALARS + IR = 2*J-1 + II = IR+1 + KM = D_MYMS(KMLOC) + + IF(KM /= 0 .AND. JN >= KM) THEN + ! (DO JN=KN,R_NTMAX+1) + JI = R_NTMAX+3-JN + PNSD(IR,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(IR,JI-1,KMLOC) + PNSD(II,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(II,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(II,JI-1,KMLOC) + + ELSEIF(KM == 0) THEN + ! (DO JN=0,R_NTMAX+1) + JI = R_NTMAX+3-JN + PNSD(IR,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(IR,JI-1,KMLOC) + ENDIF + ENDDO + ENDDO +END DO + +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif +#ifdef ACCGPU +!$ACC END DATA +#endif + +! ------------------------------------------------------------------ + +END SUBROUTINE SPNSDE +END MODULE SPNSDE_MOD diff --git a/src/trans/gpu/internal/sufft_mod.F90 b/src/trans/gpu/internal/sufft_mod.F90 new file mode 100755 index 00000000..707243a8 --- /dev/null +++ b/src/trans/gpu/internal/sufft_mod.F90 @@ -0,0 +1,48 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUFFT_MOD + CONTAINS + SUBROUTINE SUFFT + + USE PARKIND1 ,ONLY : JPIM + + USE TPM_DIM ,ONLY : R + USE TPM_GEN ,ONLY : NOUT, NPRINTLEV + USE TPM_DISTR ,ONLY : D, MYSETW + USE TPM_GEOMETRY ,ONLY : G + USE TPM_FFT ,ONLY : T + USE TPM_HICFFT ,ONLY : HICT, INIT_PLANS_FFT + ! + + IMPLICIT NONE + + INTEGER(KIND=JPIM) :: JGL,IGLG + LOGICAL :: LLP1,LLP2 + + ! ------------------------------------------------------------------ + + IF(.NOT.D%LGRIDONLY) THEN + + LLP1 = NPRINTLEV>0 + LLP2 = NPRINTLEV>1 + IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SUFFT ===' + + CALL INIT_PLANS_FFT(R%NDLON) + + ENDIF + + ! ------------------------------------------------------------------ + + 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) + + END SUBROUTINE SUFFT +END MODULE SUFFT_MOD diff --git a/src/trans/gpu/internal/sugaw_mod.F90 b/src/trans/gpu/internal/sugaw_mod.F90 new file mode 100755 index 00000000..ef9b892f --- /dev/null +++ b/src/trans/gpu/internal/sugaw_mod.F90 @@ -0,0 +1,431 @@ +! (C) Copyright 1987- ECMWF. +! (C) Copyright 1987- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUGAW_MOD +CONTAINS +SUBROUTINE SUGAW(KDGL,KM,KN,PL,PW,PANM,PFN) + +USE PARKIND1 ,ONLY : JPRD, JPIM +USE PARKIND2 ,ONLY : JPRH + +USE TPM_CONSTANTS ,ONLY : RA + +USE TPM_GEN ,ONLY : NOUT +USE GAWL_MOD ,ONLY : GAWL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE SUPOLF_MOD +USE TPM_POL + +!**** *SUGAW * - Routine to initialize the Gaussian +! abcissa and the associated weights + +! Purpose. +! -------- +! Initialize arrays PL, and PW (quadrature abscissas and weights) +!** Interface. +! ---------- +! *CALL* *SUGAW(KN,PFN,PL,PW) * + +! Explicit arguments : +! -------------------- +! INPUT: +! KDGL : Number of Gauss abscissas +! KM : Polynomial order m +! KN : Polynomial degree n +! PFN : Fourier coefficients of series expansion for +! the ordinary Legendre polynomials +! OUTPUT: +! PL (KN) : abscissas of Gauss +! PW (KN) : Weights of the Gaussian integration + +! PL (i) is the abscissa i starting from the northern pole, it is +! the cosine of the colatitude of the corresponding row of the collocation +! grid. + +! Implicit arguments : +! -------------------- +! None + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- + +! Reference. +! ---------- + +! S.L. Belousov, Tables of normalized associated Legendre Polynomials, Pergamon Press (1962) +! P.N. Swarztrauber, On computing the points and weights for Gauss-Legendre quadrature, +! SIAM J. Sci. Comput. Vol. 24 (3) pp. 945-954 (2002) + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 87-10-15 +! Michel Rochas : 90-08-30 +! Philippe Courtier : 92-12-19 Multitasking +! Ryad El Khatib : 94-04-20 Remove unused comdecks pardim and yomdim +! Mats Hamrud : 94-08-12 Printing level +! K. Yessad (Sep 2008): cleaning, improve comments. +! Nils Wedi + Mats Hamrud, 2009-02-05 revised following Swarztrauber, 2002 +! F. Vana 05-Mar-2015 Support for single precision +! ------------------------------------------------------------------ + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KDGL +INTEGER(KIND=JPIM),INTENT(IN) :: KM +INTEGER(KIND=JPIM),INTENT(IN) :: KN + +REAL(KIND=JPRD) ,INTENT(IN) :: PANM + +REAL(KIND=JPRD),INTENT(OUT) :: PW(KDGL) +REAL(KIND=JPRD),INTENT(OUT) :: PL(KDGL) + +REAL(KIND=JPRD) ,OPTIONAL, INTENT(IN) :: PFN(0:KDGL,0:KDGL) + +! ------------------------------------------------------------------ + +REAL(KIND=JPRD) :: ZLI(KDGL),ZT(KDGL),ZFN(0:KDGL/2),ZL(KDGL) +REAL(KIND=JPRD) :: ZREG(KDGL),ZMOD(KDGL),ZM(KDGL),ZRR(KDGL) +INTEGER(KIND=JPIM) :: ITER(KDGL) + +INTEGER(KIND=JPIM) :: IALLOW, INS2, ISYM, JGL, IK, IODD, I, IMAX + +REAL(KIND=JPRD) :: Z, ZEPS, Z0, ZPI + +! computations in extended precision for alternative root finding +! which also works for associated polynomials (m>0) +REAL(KIND=JPRH) :: ZLK, ZLK1, ZLLDN, ZANM +REAL(KIND=JPRH) :: ZTHETA, ZTHETA0, ZX, ZX0, ZDX0, ZH, ZPIH, ZS0 +REAL(KIND=JPRH) :: ZK1, ZK2, ZK3, ZK4 +REAL(KIND=JPRH) :: ZF1, ZF2, ZF3 +REAL(KIND=JPRH) :: FP, FQ, FP1, FQ1 +REAL(KIND=JPRH) :: X, ZXOLD, ZBIG, ZEPSH + +INTEGER(KIND=JPIM) :: ISTEPMAX + +LOGICAL :: LLP2, LLREF, LLOLD + +REAL(KIND=JPRD) :: ZDDPOL(0:KN) + +INTEGER(KIND=JPIM), PARAMETER :: JPKD=KIND(ZLK) + +FP(X) = 1._JPRH-X**2 +FQ(X) = REAL(KN*(KN+1),JPRH)-REAL(KM**2,JPRH)/(1._JPRH-X**2) +FP1(X) = -2._JPRH*X +FQ1(X) = -2._JPRH*X*REAL(KM**2,JPRH)/SQRT(1._JPRH-X**2) + +! ------------------------------------------------------------------ +! ------------------------------------------------------------------ +!* 1. Initialization + root + weight computation +! ------------------------------------------ + +LLP2 = .FALSE. +INS2 = KDGL/2 + +LLOLD=( KM == 0 .AND. KN == KDGL ).AND.PRESENT(PFN) + + +CALL GSTATS(1650,0) + +ZEPS = EPSILON(Z) +ZEPSH = EPSILON(X) + +ZBIG = SQRT(HUGE(X)) + +!* 1.1 Find the roots of the ordinary +! Legendre polynomial of degree KN using an analytical first guess +! and then refine to machine precision via Newton's method +! in double precision following Swarztrauber (2002) + +! Nils Comment: in principle the else case could also be used for this but +! this is slightly more accurate and consistent with the past + +IF( LLOLD ) THEN + + ZPI = 2.0_JPRD*ASIN(1.0_JPRD) + IODD=MOD(KDGL,2) + IK=IODD + DO JGL=IODD,KDGL,2 + ZFN(IK)=PFN(KDGL,JGL) + IK=IK+1 + ENDDO + + DO JGL=1,INS2 + Z = REAL(4*JGL-1,JPRD)*ZPI/REAL(4*KN+2,JPRD) + ! analytic initial guess for cos(theta) (same quality as RK below) + ! ZX = 1._JPRD-REAL(KN-1,JPRD)/REAL(8*KN*KN*KN,JPRD)-(1._JPRD/REAL(384*KN*KN*KN*KN))*(39._JPRD-28._JPRD/SIN(Z)**2) + ! PL(JGL) = ACOS(ZX*COS(Z)) + ZL(JGL) = Z+1.0_JPRD/(TAN(Z)*REAL(8*KN**2,JPRD)) + ZREG(JGL) = COS(Z) + ZLI(JGL) = COS(ZL(JGL)) + ENDDO + + ! refine PL here via Newton's method + + !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JGL) + DO JGL=INS2,1,-1 + CALL GAWL(ZFN,ZL(JGL),PW(JGL),ZEPS,KN,ITER(JGL),ZMOD(JGL)) + ENDDO + !$OMP END PARALLEL DO + + ! convert to physical latitude space PMU + !DIR$ IVDEP + !OCL NOVREC + DO JGL=1,INS2 + PL(JGL) = COS(ZL(JGL)) + ENDDO + +ELSE + +!* 1.2 Find the roots of the associated +! Legendre polynomial of degree KN and the associated Gaussian weights +! using a Runge-Kutta 4 integration of the Pruefer transformed Sturm-Liouville problem +! (Tygert (J. Comput. Phys. 2008) and Glaser et al., SIAM J. SCI. COMPUT. Vol. 29 (4) 1420-1438) +! + + ISTEPMAX=10 + + ZANM = REAL(PANM, JPKD) + ZPIH = 2.0_JPRH*ASIN(1.0_JPRH) + + ZX0 = 0._JPRH + Z0 = 0._JPRD + + ! first guess starting point + IF( MOD(KN-KM,2) == 0 ) THEN + ! even, extremum at X == 0 + ZTHETA0 = 0._JPRH + ZH = -0.5_JPRH*ZPIH/REAL(ISTEPMAX,JPRH) + ELSE + ! odd, root at X == 0 + ZTHETA0 = 0.5_JPRH*ZPIH + ZX0 = 0._JPRH + ZH = -ZPIH/REAL(ISTEPMAX,JPRH) + ENDIF + + ZX = ZX0 + ZTHETA = ZTHETA0 + + ZF1 = SQRT(FQ(ZX)/FP(ZX)) + ZF2 = FQ1(ZX)/FQ(ZX) + ZF3 = FP1(ZX)/FP(ZX) + + ! Formula (81) in Tygert + ZDX0=-1._JPRH/(ZF1 + 0.25_JPRH*(ZF2 + ZF3)*SIN(2._JPRH*ZTHETA)) + + ! loop over all roots + LLREF=.TRUE. + DO JGL=INS2,1,-1 + + ! runge-kutta + DGL:DO IK=1,ISTEPMAX + + ZK1 = ZDX0 + ZTHETA = ZTHETA + 0.5_JPRH*ZH + + ZX = ZX0 + 0.5_JPRH*ZH*ZK1 + + ZF1 = SQRT(FQ(ZX)/FP(ZX)) + ZF2 = FQ1(ZX)/FQ(ZX) + ZF3 = FP1(ZX)/FP(ZX) + + ZK2 = -1._JPRH/(ZF1 + 0.25_JPRH*(ZF2 + ZF3)*SIN(2._JPRH*ZTHETA)) + ZX = ZX0 + 0.5_JPRH*ZH*ZK2 + + ZF1 = SQRT(FQ(ZX)/FP(ZX)) + ZF2 = FQ1(ZX)/FQ(ZX) + ZF3 = FP1(ZX)/FP(ZX) + + ZK3 = -1._JPRH/(ZF1 + 0.25_JPRH*(ZF2 + ZF3)*SIN(2._JPRH*ZTHETA)) + ZTHETA = ZTHETA + 0.5_JPRH*ZH + ZX = ZX0 + ZH*ZK3 + + ZF1 = SQRT(FQ(ZX)/FP(ZX)) + ZF2 = FQ1(ZX)/FQ(ZX) + ZF3 = FP1(ZX)/FP(ZX) + + ZK4 = -1._JPRH/(ZF1 + 0.25_JPRH*(ZF2 + ZF3)*SIN(2._JPRH*ZTHETA)) + ZX = ZX0 + (1._JPRH/6._JPRH)*ZH*(ZK1+2._JPRH*ZK2+2._JPRH*ZK3+ZK4) + ZXOLD = ZX0 + + ZX0 = ZX + + IF( .NOT.ZX==ZX ) THEN + WRITE(NOUT,*) 'invoke overflow ...ZX ',KM, KN, JGL + ZX = ZXOLD + ZX0 = ZXOLD + EXIT DGL + ENDIF + + ZF1 = SQRT(FQ(ZX)/FP(ZX)) + ZF2 = FQ1(ZX)/FQ(ZX) + ZF3 = FP1(ZX)/FP(ZX) + + ZDX0 = -1._JPRH/(ZF1 + 0.25_JPRH*(ZF2 + ZF3)*SIN(2._JPRH*ZTHETA)) + + ENDDO DGL + +! ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +! Everything from here until <> is to refine the +! root and compute the starting point for the next root search +! ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + ! should not happen, but does if loss of accuracy in supolf occurs (useful for debugging) + IF( JGL < INS2 ) LLREF = PW(JGL+1).GT.ZEPSH + + IF( LLREF ) THEN + + ! chosen for speed/accuracy compromise + IMAX=3 + LOOP: DO I=1,IMAX + ! supol fast + ZS0 = ACOS(ZX0) + CALL SUPOLF(KM,KN,REAL(ZX0,JPRD),ZDDPOL) + ZLK=REAL(ZDDPOL(KN),JPKD) + ZLK1= REAL(ZDDPOL(KN-1),JPKD) + ZLLDN= -(ZANM*ZLK1-DDI(KN)*COS(ZS0)*ZLK)/SIN(ZS0) + + IF( ABS(ZLLDN) > ZEPSH ) THEN + ! single Newton refinement in theta + ZS0 = ZS0 - ZLK/ZLLDN + ZX = COS(ZS0) + ELSE + ! do nothing + ZX = ZX0 + ENDIF + + IF( ABS(ZX-ZX0) > 1000._JPRD*ZEPS ) THEN + ZX0 = ZX + ELSE + EXIT LOOP + ENDIF + ENDDO LOOP + + ! recompute for accuracy weights + CALL SUPOLF(KM,KN,REAL(ZX,JPRD),ZDDPOL) + ! option f in Schwarztrauber to compute the weights + ZS0 = ACOS(ZX) + ZLK=REAL(ZDDPOL(KN),JPKD) + ZLK1= REAL(ZDDPOL(KN-1),JPKD) + ZLLDN= -(ZANM*ZLK1-DDI(KN)*COS(ZS0)*ZLK)/SIN(ZS0) + + PW(JGL) = REAL(REAL(2*KN+1,JPRH)/ZLLDN**2,JPRD) + + ! catch overflow, should never happen + IF( .NOT.(PW(JGL)==PW(JGL)) ) THEN + WRITE(NOUT,*) 'invoke overflow ...PW ',KM, KN, JGL + PW(JGL) = 0.0_JPRD + ENDIF + + ELSE + ! should never happen ... + WRITE(NOUT,*) 'Refinement not possible ... PW set to 0',KM, KN, JGL + PW(JGL) = 0.0_JPRD + ENDIF + + ZX0 = ZX + PL(JGL) = REAL(ZX0,JPRD) + + ! catch overflow, should never happen + IF( .NOT.(PW(JGL)==PW(JGL)) ) THEN + WRITE(NOUT,*) 'invoke overflow ...PW ',KM, KN, JGL + PW(JGL) = 0.0_JPRD + ENDIF + +! ++++++++++++++++++++++++++++++++++++++++++++++++ +! <<<< END REFINEMENT >>>> +! ++++++++++++++++++++++++++++++++++++++++++++++++ + + ZF1 = SQRT(FQ(ZX0)/FP(ZX0)) + ZF2 = FQ1(ZX0)/FQ(ZX0) + ZF3 = FP1(ZX0)/FP(ZX0) + + ! continue to next root with refined ZX,ZR as initial condition + ZH = -ZPIH/REAL(ISTEPMAX,JPRH) + ZTHETA = 0.5_JPRH*ZPIH + ZDX0 = -1._JPRH/(ZF1 + 0.25_JPRH*(ZF2 + ZF3)*SIN(2._JPRH*ZTHETA)) + ENDDO + +ENDIF + +CALL GSTATS(1650,1) +! ------------------------------------------------------------------ + +!DIR$ IVDEP +!OCL NOVREC +DO JGL=1,KDGL/2 + ISYM = KDGL-JGL+1 + PL(ISYM) = -PL(JGL) + PW(ISYM) = PW(JGL) +ENDDO + +! ------------------------------------------------------------------ + +!* 3. Diagnostics. +! ------------ + +IF( LLOLD ) THEN + + IF(LLP2)THEN + DO JGL=1,INS2 + ZM(JGL) = (ACOS(PL(JGL))-ACOS(ZLI(JGL)))*RA + ZRR(JGL) = (ACOS(PL(JGL))-ACOS(ZREG(JGL)))*RA + ZT(JGL) = ACOS(PL(JGL))*180._JPRD/ZPI + ENDDO + ENDIF + + IALLOW = 20 + DO JGL=1,INS2 + + IF(LLP2)THEN + WRITE(UNIT=NOUT,FMT=& + &'('' M ='',I4,'' ROW ='',I4,'' ITERATIONS='',I4,'' ROOT='',F30.20,& + &'' WEIGHT='',F30.20,'' MODIF :'',E8.2)')KM,JGL,ITER(JGL),PL(JGL)& + &,PW(JGL),PL(JGL)-ZLI(JGL) + WRITE(UNIT=NOUT,FMT=& + &'(10X,'' LAST INC. : '',E8.2,'' MODIF IN M : '',F10.3,& + &'' FROM THE REGULAR GRID : '',F10.3,'' COLAT '',F10.3)')& + &ZMOD(JGL),ZM(JGL),ZRR(JGL),ZT(JGL) + ENDIF + + IF(ITER(JGL) > IALLOW)THEN + WRITE(UNIT=NOUT,FMT='('' CONVERGENCE FAILED IN SUGAW '')') + WRITE(UNIT=NOUT,FMT='('' ALLOWED : '',I4,''& + &NECESSARY : '',& + &I4)')IALLOW,ITER(JGL) + CALL ABORT_TRANS(' FAILURE IN SUGAW ') + ENDIF + + ENDDO + +ELSE + + IF(LLP2)THEN + DO JGL=1,INS2 + WRITE(UNIT=NOUT,FMT=& + &'('' M ='',I4,'' ROW ='',I4,'' ITERATIONS='',I4,'' ROOT='',F30.20,& + &'' WEIGHT='',F30.20,'' COLAT '',F10.3)')KM,JGL,0,PL(JGL),PW(JGL),& + & ACOS(PL(JGL))*180._JPRD/ZPIH + ENDDO + ENDIF + +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE SUGAW +END MODULE SUGAW_MOD diff --git a/src/trans/gpu/internal/suleg_mod.F90 b/src/trans/gpu/internal/suleg_mod.F90 new file mode 100755 index 00000000..09ea4ce5 --- /dev/null +++ b/src/trans/gpu/internal/suleg_mod.F90 @@ -0,0 +1,877 @@ +! (C) Copyright 1987- ECMWF. +! (C) Copyright 1987- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SULEG_MOD +#ifdef __NEC__ +#define SIZEOF(x) STORAGE_SIZE(x)/KIND(x) +#endif +CONTAINS +SUBROUTINE SULEG +!DEC$ OPTIMIZE:1 + +USE PARKIND_ECTRANS ,ONLY : JPRD, JPIM, JPRBT +USE PARKIND2 ,ONLY : JPRH +USE MPL_MODULE + +USE TPM_GEN +USE TPM_DIM +USE TPM_CONSTANTS +USE TPM_DISTR +USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F +USE TPM_FLT +USE TPM_GEOMETRY +USE TPM_CTL +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + +USE PRE_SULEG_MOD +USE SUGAW_MOD +USE SUPOL_MOD +USE SUPOLF_MOD +USE TPM_POL +USE SUTRLE_MOD +USE SETUP_GEOM_MOD +USE SEEFMM_MIX +USE SET2PE_MOD +USE ABORT_TRANS_MOD +USE PREPSNM_MOD ,ONLY : PREPSNM +USE WRITE_LEGPOL_MOD +USE READ_LEGPOL_MOD + +!**** *SULEG * - initialize the Legendre polynomials + +! Purpose. +! -------- +! Initialize COMMON YOMLEG + +!** Interface. +! ---------- +! *CALL* *SULEG* + +! Explicit arguments : +! -------------------- + +! Implicit arguments : +! -------------------- +! COMMON YOMLEG + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- +! SUGAW (Gaussian latitudes) +! SUPOLM (polynomials) +! LFI routines for external IO's +! Called by SUGEM. + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS +! +! S.L. Belousov, Tables of normalized associated Legendre Polynomials, Pergamon Press (1962) +! P.N. Swarztrauber, On computing the points and weights for Gauss-Legendre quadrature, +! SIAM J. Sci. Comput. Vol. 24 (3) pp. 945-954 (2002) + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 87-10-15 +! MODIFICATION : 91-04 J.M. Piriou: +! - Read gaussian latitudes and PNM on LFI +! - If file missing, computes +! 91-04 M.Hamrud: +! - IO Scheme introduced +! MODIFICATION : 91-07-03 P.Courtier suppress derivatives +! MODIFICATION : 91-07-03 P.Courtier computes RATATH and RACTHE +! MODIFICATION : 91-07-03 P.Courtier change upper limit (NSMAX+1) +! MODIFICATION : 91-07-03 P.Courtier change ordering +! Order of the PNM in the file, as in the model : +! - increasing wave numbers m +! - for a given m, from n=NSMAX+1 to m +! MODIFICATION : 92-07-02 R. Bubnova: shift RATATH calculation +! to SUGEM1 +! MODIFICATION : 92-12-17 P.Courtier multitask computations +! Modified by R. EL Khatib : 93-04-02 Set-up defaults controled by LECMWF +! MODIFICATION : 93-03-19 D.Giard : n <= NTMAX +! K. YESSAD : 93-05-11 : DLMU --> global array DRMU(NDGSA:NDGEN). +! (not stored currently on LFI files). +! MODIFICATION : 94-02-03 R. El Khatib : subroutine SULEG2 to write out +! the Leg. polynomials on workfile or LFI file +! Modification : 94-08-31 M. Tolstykh: Setup for CUD interpolation +! Modified by K. YESSAD (MARCH 1995): Extra-latitudes computations +! according to value of NDGSUR and LRPOLE only. +! + change fancy loop numbering. +! Modified 98-08-10 by K. YESSAD: removal of LRPOLE option. +! - removal of LRPOLE in YOMCT0. +! - removal of code under LRPOLE. +! R. El Khatib 11-Apr-2007 Emulation of vectorized quadruple precision +! on NEC +! Nils Wedi + Mats Hamrud, 2009-02-05 revised following Swarztrauber, 2002 +! G.Mozdzynski: March 2011 Support 2D (RW,RV) initialisation of legendre coeffs +! G.Mozdzynski: July 2012 distribute FLT initialisation over NPRTRV +! R. El Khatib 14-Jun-2013 optional computation on the stretched latitudes +! F. Vana 05-Mar-2015 Support for single precision +! Nils Wedi, 20-Apr-2015 Support dual latitude/longitude set +! T. Wilhelmsson, 22-Sep-2016 Support single precision for dual too +! ------------------------------------------------------------------ + +IMPLICIT NONE + +! LOCAL +! ------------------------------------------------------------------ +REAL(KIND=JPRD),ALLOCATABLE :: ZPNMG(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZFN(:,:) +REAL(KIND=JPRD),ALLOCATABLE :: ZLRMUZ2(:) +REAL(KIND=JPRBT) :: ZEPSNM(0:R%NTMAX+2) +REAL(KIND=JPRD) :: ZLRMUZ(R%NDGL) +REAL(KIND=JPRD) :: ZW(R%NDGL) + +REAL(KIND=JPRD) :: ZANM +REAL(KIND=JPRD) :: ZFNN +REAL(KIND=JPRD) :: ZPI, ZINC, ZOFF, ZTEMP, ZORIG, ZTHETA, ZCOS + +REAL(KIND=JPRD), ALLOCATABLE :: ZSNDBUFV(:),ZRCVBUFV(:,:) +REAL(KIND=JPRD), ALLOCATABLE :: ZPNMCDO(:,:),ZPNMCDD(:,:) +REAL(KIND=JPRBT), ALLOCATABLE :: ZRCVBUTFV(:,:) +REAL(KIND=KIND(ZRCVBUTFV)) :: ZBYTES +INTEGER(KIND=JPIM) :: IBYTES +INTEGER(KIND=JPIM) :: ISENDREQ(NPRTRV) +INTEGER(KIND=JPIM) :: IRECVREQ(NPRTRV) + +INTEGER(KIND=JPIM) :: INM, IM, IRECV, ISEND, ISREQ, IRREQ, & + &JGL, JM, JMLOC, IMLOC, JN, JNM, IODD, INN, INMAX, JI, IMAXN, ITAG, ITAG1, & + &INX, ISL, ISTART, ITHRESHOLD, INSMAX, IMAXCOLS,ILATSMAX,JW,JV,J, & + &IDGLU, ILA, ILS, IA, IS, I, ILATS, ILOOP, IPRTRV, JSETV, JH, & + &IMAXRECVA, IMAXRECVS, IHEMIS, INNH, IGL, IGL1, IGL2, & + &IDGLU2, ISYM, INZ + +REAL(KIND=JPRD) :: ZEPS_INT_DEC +REAL(KIND=JPRD) :: ZEPS +REAL(KIND=JPRD),ALLOCATABLE :: ZLFPOL(:,:) +REAL(KIND=JPRD),ALLOCATABLE :: ZLPOL(:) + +LOGICAL :: LLP1,LLP2 + +! For latitudes on the stretched geometry +REAL(KIND=JPRH) :: ZTAN +REAL(KIND=JPRH) :: ZSTRETMU(R%NDGL) + +! ------------------------------------------------------------------ + +!* 0. Some initializations. +! --------------------- + +IBYTES = MPL_BYTES(ZBYTES) + +ZEPS = 1000._JPRD*EPSILON(ZEPS) +!ZEPS_INT_DEC = EPSILON(ZEPS) +ZEPS_INT_DEC = 1.0E-7_JPRD +!ZEPS_INT_DEC = 1.0E-5_JPRD + +IHEMIS=1 +IF (S%LSOUTHPNM) IHEMIS=2 +LLP1 = NPRINTLEV>0 +LLP2 = NPRINTLEV>1 +IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SULEG ===' + +IF( NPROC > 1 )THEN + CALL GSTATS(798,0) + CALL MPL_BARRIER(CDSTRING='SULEG:') + CALL GSTATS(798,1) +ENDIF + +CALL GSTATS(140,0) +CALL GSTATS(1801,0) + +IF(.NOT.D%LGRIDONLY) THEN + CALL PRE_SULEG +ENDIF + +ALLOCATE(F%RMU(R%NDGL)) +IF (LLP2) WRITE(NOUT,9) 'F%RMU ',SIZE(F%RMU ),SHAPE(F%RMU ) +ALLOCATE(F%RW(R%NDGL)) +IF (LLP2) WRITE(NOUT,9) 'F%RW ',SIZE(F%RW ),SHAPE(F%RW ) + + +!* 1.0 Initialize Fourier coefficients for ordinary Legendre polynomials +! ------------------------------------------------------------------------ + +ALLOCATE(ZFN(0:R%NDGL,0:R%NDGL)) +IF (LLP2) WRITE(NOUT,9) 'ZFN ',SIZE(ZFN ),SHAPE(ZFN ) + + + +! determines the number of stripes in butterfly NSMAX/IMAXCOLS +! IMAXCOLS = (R%NSMAX - 1)/4 + 1 +! IMAXCOLS=64 (min flops) +IMAXCOLS=64 + +! the threshold of efficiency +IF(NPROC == 1 .OR. R%NDGNH <= 2560) THEN + ITHRESHOLD = R%NDGNH/4 + DO + IF(ITHRESHOLD >= IMAXCOLS*4) EXIT + IMAXCOLS = IMAXCOLS/2 + ENDDO +ELSE + ITHRESHOLD = 900 +ENDIF + +ITHRESHOLD = MAX(ITHRESHOLD,IMAXCOLS+1) +S%ITHRESHOLD = ITHRESHOLD + +!* 3.1 Gaussian latitudes and weights +! --------------------------------------- + +CALL INI_POL(R%NTMAX+3) + +IF(.NOT.D%LGRIDONLY) THEN + ISTART=1 +ELSE + ISTART=R%NDGL +ENDIF + +INMAX=R%NDGL +! Belousov, Swarztrauber use ZFN(0,0)=SQRT(2._JPRD) +! IFS normalisation chosen to be 0.5*Integral(Pnm**2) = 1 +ZFN(0,0)=2._JPRD +DO JN=ISTART,R%NDGL + ZFNN=ZFN(0,0) + DO JGL=1,JN + ZFNN=ZFNN*SQRT(1._JPRD-0.25_JPRD/REAL(JGL**2,JPRD)) + ENDDO + + IODD=MOD(JN,2) + ZFN(JN,JN)=ZFNN + DO JGL=2,JN-IODD,2 + ZFN(JN,JN-JGL)=ZFN(JN,JN-JGL+2)*REAL((JGL-1)*(2*JN-JGL+2),JPRD)/REAL(JGL*(2*JN-JGL+1),JPRD) + ENDDO +ENDDO + +! compute latitudes and weights for original Gaussian latitudes +ZANM=SQRT(REAL(2*INMAX+1,JPRD)*REAL(INMAX**2,JPRD)/REAL(2*INMAX-1,JPRD)) +INN=R%NDGL +CALL GSTATS(1801,2) +CALL SUGAW(INN,0,INMAX,ZLRMUZ(1:INN),ZW(1:INN),ZANM,ZFN) +CALL GSTATS(1801,3) + +IF (ABS(G%RSTRET-1.0_JPRD)>100._JPRD*EPSILON(1._JPRD)) THEN + WRITE(NOUT,*) '=== SULEG: Change Gaussian latitudes to the transformed sphere ===' + INNH=(INN+1)/2 + ZTAN=(1.0_JPRD-G%RSTRET**2)/(1.0_JPRD+G%RSTRET**2) +! North hemisphere + DO JGL=1,INNH + ZSTRETMU(JGL)=(ZTAN+REAL(ZLRMUZ(JGL),JPRH))/(1.0_JPRD+ZTAN*REAL(ZLRMUZ(JGL),JPRH)) + ENDDO +! South hemisphere + DO JGL=1,INNH + IGL=2*INNH-JGL+1 + ZSTRETMU(IGL)=(ZTAN-REAL(ZLRMUZ(JGL),JPRH))/(1.0_JPRD-ZTAN*REAL(ZLRMUZ(JGL),JPRH)) + ENDDO + DO JGL=1,INN + ZLRMUZ(JGL)=REAL(ZSTRETMU(JGL),JPRD) + ENDDO +ENDIF + +DO JGL=1,R%NDGL + F%RW(JGL) = ZW(JGL) + F%RMU(JGL) = ZLRMUZ(JGL) +ENDDO + +IF (LLP1) WRITE(NOUT,*) '=== SULEG: Finished Gaussian latitudes ===' + +!* 3.1.1 specify a dual set of output (inv_trans) or input (dir_trans) latitudes / longitudes + +IF( S%LDLL ) THEN + + INMAX = S%NDGL + INN= S%NDGL + + S%NDGNHD = (INMAX+1)/2 + ALLOCATE(ZLRMUZ2(INN)) + + ! here we want to use the positions of the specified dual grid + ! accuracy requirement is ZLRMUZ2(JGL) < F%RMU(1) + ! so we use approximations for the remaining latitudes outside this range + ! we approximate the vicinity to the pole/equator + + ZPI = 2.0_JPRD*ASIN(1.0_JPRD) + + ZORIG = ASIN(F%RMU(1)) + IF( S%LSHIFTLL ) THEN + ZINC = ZPI/REAL(INN,JPRD) + ZOFF = 0.5_JPRD*ZINC + ZTEMP = ZOFF + ZINC*REAL(S%NDGNHD-1,JPRD) + ZLRMUZ2(1) = SIN(MIN(ZTEMP,ZORIG) - 0.5_JPRD*MAX(0._JPRD,ZTEMP - ZORIG)) + ZLRMUZ2(S%NDGNHD) = SIN(ZOFF) + ELSE + ZINC = ZPI/REAL(INN-2,JPRD) + ZOFF=-0.5_JPRD*ZINC + ZTEMP = ZOFF + ZINC*REAL(S%NDGNHD-1,JPRD) + ZLRMUZ2(1) = SIN(MIN(ZTEMP,ZORIG) - 0.5_JPRD*MAX(0._JPRD,ZTEMP - ZORIG)) + ZOFF=0.01_JPRD*ZINC + ZLRMUZ2(S%NDGNHD) = SIN(ZOFF) + ZOFF=0._JPRD + ENDIF + DO JGL=2, S%NDGNHD-1 + ZLRMUZ2(JGL) = SIN(ZOFF + ZINC*REAL(S%NDGNHD-JGL,JPRD)) + ENDDO + DO JGL=1, S%NDGNHD + ISYM = INN-JGL+1 + ZLRMUZ2(ISYM) = -ZLRMUZ2(JGL) + ENDDO + + IF( LLP2 ) THEN + WRITE(NOUT,*) 'dual latitudes' + DO JGL= 1, INN + WRITE(NOUT,*) 'dual JGL=',JGL,(180._JPRD/ZPI)*ZINC,(180._JPRD/ZPI)*ASIN(ZLRMUZ2(JGL)),(180._JPRD/ZPI)*ASIN(F%RMU(JGL)) + ENDDO + ENDIF + + ALLOCATE(F%RMU2(INMAX)) + IF (LLP2) WRITE(NOUT,9) 'F%RMU2 ',SIZE(F%RMU2 ),SHAPE(F%RMU2 ) + ALLOCATE(F%RACTHE2(INMAX)) + IF (LLP2) WRITE(NOUT,9) 'F%RACTHE2 ',SIZE(F%RACTHE2),SHAPE(F%RACTHE2 ) + DO JGL=1,INN + F%RMU2(JGL) = ZLRMUZ2(JGL) + F%RACTHE2(JGL) = 1.0_JPRD/(SQRT(1.0_JPRD-ZLRMUZ2(JGL)*ZLRMUZ2(JGL))+ZEPS)/REAL(RA,JPRD) + ENDDO + + IF (LLP1) WRITE(NOUT,*) '=== SULEG: Finished dual Gaussian latitudes ===' + + ! inverse + direct map for FMM + INX=2*R%NDGNH + INZ=2*S%NDGNHD + ALLOCATE(S%FMM_INTI) + CALL SETUP_SEEFMM(INX,F%RMU,INZ,F%RMU2,S%FMM_INTI) + +ENDIF + +!* 3.2 Computes related arrays + +IF(.NOT.D%LGRIDONLY) THEN + + ALLOCATE(S%FA(D%NUMP)) + + ALLOCATE(F%R1MU2(R%NDGL)) + IF (LLP2) WRITE(NOUT,9) 'F%R1MU2 ',SIZE(F%R1MU2),SHAPE(F%R1MU2 ) + ALLOCATE(F%RACTHE(R%NDGL)) + IF (LLP2) WRITE(NOUT,9) 'F%RACTHE ',SIZE(F%RACTHE),SHAPE(F%RACTHE ) + + IF( S%LUSE_BELUSOV) THEN + ALLOCATE(F%RPNM(R%NLEI3,D%NSPOLEGL)) + IF (LLP2) WRITE(NOUT,9) 'F%RPNM ',SIZE(F%RPNM),SHAPE(F%RPNM) + DO JNM=1,D%NSPOLEGL + F%RPNM(R%NLEI3,JNM) = 0.0_JPRD + ENDDO + ENDIF + +!* 3.2 Computes related arrays + + DO JGL=1,R%NDGL +! test cosine differently + ZTHETA = ASIN(ZLRMUZ(JGL)) + ZCOS = COS(ZTHETA) + F%R1MU2(JGL) = ZCOS**2 + F%RACTHE(JGL) = 1.0_JPRD/ZCOS/REAL(RA,JPRD) +! F%R1MU2(JGL) = 1.0_JPRD-ZLRMUZ(JGL)*ZLRMUZ(JGL) +! F%RACTHE(JGL) = 1.0_JPRD/SQRT(1.0_JPRD-ZLRMUZ(JGL)*ZLRMUZ(JGL))/REAL(RA,JPRD) + ENDDO + +!* 3.3 Working arrays + +! compute the Legendre polynomials as a function of the z_k (Gaussian Latitudes) +! this may be faster than calling supolf for each m but uses extra communication +! and the parallelism is more limited ? Nils + + IF( S%LUSE_BELUSOV .AND. .NOT. C%LREAD_LEGPOL ) THEN + + INSMAX = R%NTMAX+1 + + IF( INSMAX /= R%NDGL) THEN + DEALLOCATE(ZFN) + ALLOCATE(ZFN(0:INSMAX,0:INSMAX)) + ! Belousov, Swarztrauber use ZFN(0,0)=SQRT(2._JPRD) + ! IFS normalisation chosen to be 0.5*Integral(Pnm**2) = 1 + ZFN(0,0)=2._JPRD + DO JN=1,INSMAX + ZFNN=ZFN(0,0) + DO JGL=1,JN + ZFNN=ZFNN*SQRT(1._JPRD-0.25_JPRD/REAL(JGL**2,JPRD)) + ENDDO + + IODD=MOD(JN,2) + ZFN(JN,JN)=ZFNN + DO JGL=2,JN-IODD,2 + ZFN(JN,JN-JGL)=ZFN(JN,JN-JGL+2)*REAL((JGL-1)*(2*JN-JGL+2),JPRD)/REAL(JGL*(2*JN-JGL+1),JPRD) + ENDDO + ENDDO + ENDIF + + ALLOCATE(ZLFPOL(0:INSMAX,0:INSMAX)) + ALLOCATE(ZPNMG(R%NSPOLEG)) + + DO JH=1,IHEMIS + + IF (JH==1) THEN + IGL1=D%NLATLS(MYSETW,MYSETV) + IGL2=D%NLATLE(MYSETW,MYSETV) + ELSE + IGL1=R%NDGL-D%NLATLE(MYSETW,MYSETV)+1 + IGL2=R%NDGL-D%NLATLS(MYSETW,MYSETV)+1 + ENDIF + + ILOOP=0 + DO JGL=IGL1,IGL2 + + INM = 0 + CALL SUPOL(INSMAX,ZLRMUZ(JGL),ZFN,ZLFPOL) + DO JM=0,R%NSMAX + DO JN=INSMAX,JM,-1 + INM = INM+1 + ZPNMG(INM) = ZLFPOL(JM,JN) + ENDDO + ENDDO + + CALL GSTATS(1801,2) + ILOOP = JGL-IGL1+1 + CALL SUTRLE(ZPNMG,JGL,ILOOP) + CALL GSTATS(1801,3) + + ENDDO + + ILATSMAX=0 + DO JW=1,NPRTRW + DO JV=1,NPRTRV + ILATSMAX=MAX(ILATSMAX,D%NLATLE(JW,JV)-D%NLATLS(JW,JV)+1) + ENDDO + ENDDO + + ILATS=IGL2-IGL1+1 + IF (S%LSOUTHPNM .AND. IHEMIS==1 .AND. ILATSMAX-1 >= ILATS) THEN + ! I don't know what to do for south pole. But isn't this piece of code + ! a dead stuff for poles rows ? + CALL ABORT_TRANS('SULEG: WILL BE BROKEN FOR SOUTH HEMISPHERE') + ENDIF + + DO J=ILATS,ILATSMAX-1 + ILOOP=ILOOP+1 + CALL GSTATS(1801,2) + CALL SUTRLE(ZPNMG,-1,ILOOP) + CALL GSTATS(1801,3) + ENDDO + + ENDDO + + DEALLOCATE(ZLFPOL) + IF( ALLOCATED(ZFN) ) DEALLOCATE(ZFN) + + DEALLOCATE(ZPNMG) + + IF(LLP1) WRITE(NOUT,*) '=== SULEG: Finished RPNM ===' + + ENDIF + + CALL SETUP_GEOM + + IMAXN=R%NTMAX+1 + + ITAG=MTAGLETR + ITAG1=MTAGLETR+1 + + IMAXRECVA=0 + IMAXRECVS=0 + DO JMLOC=1,D%NUMP + IM = D%MYMS(JMLOC) + ILA = (R%NSMAX-IM+2)/2 + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + ISL = MAX(R%NDGNH-G%NDGLU(IM)+1,1) + IMAXRECVA = MAX(IDGLU*ILA,IMAXRECVA) + IMAXRECVS = MAX(IDGLU*ILS,IMAXRECVS) + + !find nearest starting latitude of the dual set + IF( S%LDLL ) THEN + + INMAX=MIN(R%NTMAX+1,S%NDGL) + IDGLU2=S%NDGNHD + S%FA(JMLOC)%ISLD = 1 + LLA:DO JGL=1,S%NDGNHD-1 + IF( (ZLRMUZ2(JGL) < ZLRMUZ(ISL)) ) THEN + S%FA(JMLOC)%ISLD = JGL + IDGLU2 = S%NDGNHD-S%FA(JMLOC)%ISLD+1 + EXIT LLA + ENDIF + ENDDO LLA + + IF( .NOT. C%LREAD_LEGPOL ) THEN + ! compute auxiliary quantities for the dual mapping + + ! output data latitudes + ALLOCATE(ZPNMCDO(2*IDGLU2,2)) + !$OMP PARALLEL PRIVATE(JGL,ZLPOL) + IF (.NOT.ALLOCATED(ZLPOL)) ALLOCATE(ZLPOL(0:INMAX)) + !$OMP DO SCHEDULE(DYNAMIC,1) + DO JGL=1,2*IDGLU2 + CALL SUPOLF(IM,INMAX,ZLRMUZ2(S%FA(JMLOC)%ISLD+JGL-1),ZLPOL(0:INMAX)) + ZPNMCDO(JGL,1)=ZLPOL(INMAX-1) + ZPNMCDO(JGL,2)=ZLPOL(INMAX) + ENDDO + !$OMP END DO + IF (ALLOCATED(ZLPOL)) DEALLOCATE(ZLPOL) + !$OMP END PARALLEL + + ! internal (gg-roots) latitudes + ALLOCATE(ZPNMCDD(2*IDGLU,2)) + !$OMP PARALLEL PRIVATE(JGL,ZLPOL,JI,JN) + IF (.NOT.ALLOCATED(ZLPOL)) ALLOCATE(ZLPOL(0:INMAX)) + !$OMP DO SCHEDULE(DYNAMIC,1) + DO JGL=1,2*IDGLU + CALL SUPOLF(IM,INMAX,ZLRMUZ(ISL+JGL-1),ZLPOL(0:INMAX)) + ZPNMCDD(JGL,1)=ZLPOL(INMAX-1) + ZPNMCDD(JGL,2)=ZLPOL(INMAX) + ENDDO + !$OMP END DO + IF (ALLOCATED(ZLPOL)) DEALLOCATE(ZLPOL) + !$OMP END PARALLEL + + stop 'Error: code path not (yet) supported in GPU version' + !CALL PREPSNM(IM,JMLOC,ZEPSNM) + ALLOCATE(S%FA(JMLOC)%RPNMWI(2*IDGLU,1:2)) + DO JGL=1,2*IDGLU + ! inverse trafo + S%FA(JMLOC)%RPNMWI(JGL,1) = F%RW(ISL+JGL-1)*ZPNMCDD(JGL,1) + S%FA(JMLOC)%RPNMWI(JGL,2) = F%RW(ISL+JGL-1)*ZPNMCDD(JGL,2) + ! direct trafo needed if mapping to another set of gg roots + !S%FA(JMLOC)%RPNMWI(JGL,3) = -ZEPSNM(IMAXN)*ZPNMCDD(JGL,2) + !S%FA(JMLOC)%RPNMWI(JGL,4) = -ZEPSNM(IMAXN)*ZPNMCDD(JGL,1) + ENDDO + DEALLOCATE(ZPNMCDD) + ALLOCATE(S%FA(JMLOC)%RPNMWO(2*IDGLU2,1:2)) + DO JGL=1,2*IDGLU2 + ! inverse trafo + S%FA(JMLOC)%RPNMWO(JGL,1) = -ZEPSNM(IMAXN)*ZPNMCDO(JGL,2) + S%FA(JMLOC)%RPNMWO(JGL,2) = -ZEPSNM(IMAXN)*ZPNMCDO(JGL,1) + ! only needed in direct trafo, need if mapping to another set of roots + !S%FA(JMLOC)%RPNMWO(JGL,3) = F%RW2(S%FA(JMLOC)%ISLD+JGL-1)*ZPNMCDO(JGL,1) + !S%FA(JMLOC)%RPNMWO(JGL,4) = F%RW2(S%FA(JMLOC)%ISLD+JGL-1)*ZPNMCDO(JGL,2) + ENDDO + DEALLOCATE(ZPNMCDO) + ENDIF ! LREAD_LEGPOL + ENDIF ! LDLL + + ENDDO + + IF( S%LDLL ) THEN + DEALLOCATE(ZLRMUZ2) + ENDIF + + CALL GSTATS(1801,2) + + IF(.NOT.C%LREAD_LEGPOL) THEN + +! not correct logic + + DO JMLOC=1,D%NUMP,NPRTRV ! +++++++++++++++++++++ JMLOC LOOP +++++++++++++++++++++++ + + IPRTRV=MIN(NPRTRV,D%NUMP-JMLOC+1) + + ! --------------------anti-symmetric----------------------- + ! --------------------anti-symmetric----------------------- + ! --------------------anti-symmetric----------------------- + + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ILA = (R%NSMAX-IM+2)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + ALLOCATE(S%FA(IMLOC)%RPNMA(IDGLU,ILA)) + ENDDO + + IF( .NOT. S%LUSE_BELUSOV ) THEN + + ISREQ = 0 + IRREQ = 0 + + ALLOCATE (ZRCVBUFV(IMAXRECVA,IPRTRV)) + CALL GSTATS(851,0) + DO JSETV=1,IPRTRV + CALL SET2PE(IRECV,0,0,MYSETW,JSETV) + IF( .NOT.LMPOFF )THEN + IRREQ = IRREQ+1 + CALL MPL_RECV(ZRCVBUFV(:,JSETV),KSOURCE=NPRCIDS(IRECV), & + & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IRECVREQ(IRREQ),& + & KTAG=ITAG,CDSTRING='SULEG:') + ENDIF + ENDDO + CALL GSTATS(851,1) + + IF( JMLOC+MYSETV-1 <= D%NUMP )THEN + + IMLOC=JMLOC+MYSETV-1 + IM = D%MYMS(IMLOC) + ISL = MAX(R%NDGNH-G%NDGLU(IM)+1,1) + IA = 1+MOD(R%NSMAX-IM+2,2) + ILA = (R%NSMAX-IM+2)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + + ALLOCATE(ZSNDBUFV(IDGLU*ILA)) + + IF(MOD(IMAXN-IM,2) == 0) THEN + INMAX=IMAXN+1 + ELSE + INMAX=IMAXN + ENDIF + + CALL GSTATS(1251,0) + IF (.NOT.ALLOCATED(ZLPOL)) ALLOCATE(ZLPOL(0:INMAX)) + !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,ZLPOL,JI,JN) + DO JGL=1,IDGLU + CALL SUPOLF(IM,INMAX,ZLRMUZ(ISL+JGL-1),ZLPOL(0:INMAX),KCHEAP=3) + DO JI=1,ILA + JN=IM+2*(JI-1)+1 + ZSNDBUFV((JGL-1)*ILA+JI)=ZLPOL(JN) + ENDDO + ENDDO + !$OMP END PARALLEL DO + IF (ALLOCATED(ZLPOL)) DEALLOCATE(ZLPOL) + CALL GSTATS(1251,1) + + CALL GSTATS(851,0) + DO JSETV=1,NPRTRV + CALL SET2PE(ISEND,0,0,MYSETW,JSETV) + IF( .NOT.LMPOFF )THEN + ISREQ = ISREQ+1 + CALL MPL_SEND(ZSNDBUFV(:),KDEST=NPRCIDS(ISEND), & + & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(ISREQ),& + & KTAG=ITAG,CDSTRING='SULEG:') + ENDIF + ENDDO + CALL GSTATS(851,1) + + ENDIF + + CALL GSTATS(851,0) + IF(IRREQ > 0) THEN + CALL MPL_WAIT(KREQUEST=IRECVREQ(1:IRREQ), & + & CDSTRING='SUTRLE: SULEG') + ENDIF + + IF(ISREQ > 0) THEN + CALL MPL_WAIT(KREQUEST=ISENDREQ(1:ISREQ), & + & CDSTRING='SUTRLE: SULEG') + ENDIF + + IF( NPROC==1.AND.LMPOFF )THEN + ZRCVBUFV(1:SIZE(ZSNDBUFV(:)),1)=ZSNDBUFV(:) + ENDIF + CALL GSTATS(851,1) + + CALL GSTATS(1251,0) + !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JSETV,IMLOC,IM,ISL,IA,ILA,IDGLU,JGL,JI) + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ISL = MAX(R%NDGNH-G%NDGLU(IM)+1,1) + IA = 1+MOD(R%NSMAX-IM+2,2) + ILA = (R%NSMAX-IM+2)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + DO JGL=1,IDGLU + DO JI=1,ILA + S%FA(IMLOC)%RPNMA(JGL,ILA-JI+1)=ZRCVBUFV((JGL-1)*ILA+JI,JSETV) + ENDDO + ENDDO + ENDDO + !$OMP END PARALLEL DO + CALL GSTATS(1251,1) + + IF( ALLOCATED(ZSNDBUFV) ) DEALLOCATE(ZSNDBUFV) + IF( ALLOCATED(ZRCVBUFV) ) DEALLOCATE(ZRCVBUFV) + + ELSE + + CALL GSTATS(1251,0) + !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JSETV,IMLOC,IM,ISL,IA,ILA,IDGLU,JGL,JI) + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ISL = MAX(R%NDGNH-G%NDGLU(IM)+1,1) + IA = 1+MOD(R%NSMAX-IM+2,2) + ILA = (R%NSMAX-IM+2)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + DO JI=1,ILA + DO JGL=1,IDGLU + S%FA(IMLOC)%RPNMA(JGL,JI) = F%RPNM(ISL+JGL-1,D%NPMS(IM)+IA+(JI-1)*2) + ENDDO + ENDDO + ENDDO + !$OMP END PARALLEL DO + CALL GSTATS(1251,1) + + ENDIF + + ! --------------------symmetric----------------------- + ! --------------------symmetric----------------------- + ! --------------------symmetric----------------------- + + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + ALLOCATE(S%FA(IMLOC)%RPNMS(IDGLU,ILS)) + ENDDO + + IF( .NOT. S%LUSE_BELUSOV ) THEN + + ISREQ = 0 + IRREQ = 0 + + ALLOCATE (ZRCVBUFV(IMAXRECVS,IPRTRV)) + CALL GSTATS(851,0) + DO JSETV=1,IPRTRV + CALL SET2PE(IRECV,0,0,MYSETW,JSETV) + IF( .NOT.LMPOFF )THEN + IRREQ = IRREQ+1 + CALL MPL_RECV(ZRCVBUFV(:,JSETV),KSOURCE=NPRCIDS(IRECV), & + & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IRECVREQ(IRREQ),& + & KTAG=ITAG,CDSTRING='SULEG:') + ENDIF + ENDDO + CALL GSTATS(851,1) + + IF( JMLOC+MYSETV-1 <= D%NUMP )THEN + + IMLOC=JMLOC+MYSETV-1 + IM = D%MYMS(IMLOC) + ISL = MAX(R%NDGNH-G%NDGLU(IM)+1,1) + IS = 1+MOD(R%NSMAX-IM+1,2) + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + + ALLOCATE(ZSNDBUFV(IDGLU*ILS)) + + IF(MOD(IMAXN-IM,2) == 0) THEN + INMAX=IMAXN + ELSE + INMAX=IMAXN+1 + ENDIF + + IF (.NOT.ALLOCATED(ZLPOL)) ALLOCATE(ZLPOL(0:INMAX)) + CALL GSTATS(1251,0) + !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,ZLPOL,JI,JN) + DO JGL=1,IDGLU + CALL SUPOLF(IM,INMAX,ZLRMUZ(ISL+JGL-1),ZLPOL(0:INMAX),KCHEAP=2) + DO JI=1,ILS + JN=IM+2*(JI-1) + ZSNDBUFV((JGL-1)*ILS+JI)=ZLPOL(JN) + ENDDO + ENDDO + !$OMP END PARALLEL DO + CALL GSTATS(1251,1) + IF (ALLOCATED(ZLPOL)) DEALLOCATE(ZLPOL) + + CALL GSTATS(851,0) + DO JSETV=1,NPRTRV + CALL SET2PE(ISEND,0,0,MYSETW,JSETV) + IF( .NOT.LMPOFF )THEN + ISREQ = ISREQ+1 + CALL MPL_SEND(ZSNDBUFV(:),KDEST=NPRCIDS(ISEND), & + & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(ISREQ),& + & KTAG=ITAG,CDSTRING='SULEG:') + ENDIF + ENDDO + CALL GSTATS(851,1) + + ENDIF + + CALL GSTATS(851,0) + IF(IRREQ > 0) THEN + CALL MPL_WAIT(KREQUEST=IRECVREQ(1:IRREQ), & + & CDSTRING='SUTRLE: SULEG') + ENDIF + + IF(ISREQ > 0) THEN + CALL MPL_WAIT(KREQUEST=ISENDREQ(1:ISREQ), & + & CDSTRING='SUTRLE: SULEG') + ENDIF + IF( NPROC==1.AND.LMPOFF )THEN + ZRCVBUFV(1:SIZE(ZSNDBUFV(:)),1)=ZSNDBUFV(:) + ENDIF + CALL GSTATS(851,1) + + CALL GSTATS(1251,0) + !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JSETV,IMLOC,IM,ISL,IS,ILS,IDGLU,JGL,JI) + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ISL = MAX(R%NDGNH-G%NDGLU(IM)+1,1) + IS = 1+MOD(R%NSMAX-IM+1,2) + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + DO JGL=1,IDGLU + DO JI=1,ILS + S%FA(IMLOC)%RPNMS(JGL,ILS-JI+1)=ZRCVBUFV((JGL-1)*ILS+JI,JSETV) + ENDDO + ENDDO + ENDDO + !$OMP END PARALLEL DO + CALL GSTATS(1251,1) + + IF( ALLOCATED(ZSNDBUFV) ) DEALLOCATE(ZSNDBUFV) + IF( ALLOCATED(ZRCVBUFV) ) DEALLOCATE(ZRCVBUFV) + + ELSE + CALL GSTATS(1251,0) + !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JSETV,IMLOC,IM,ISL,IS,ILS,IDGLU,JGL,JI) + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ISL = MAX(R%NDGNH-G%NDGLU(IM)+1,1) + IS = 1+MOD(R%NSMAX-IM+1,2) + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + DO JI=1,ILS + DO JGL=1,IDGLU + S%FA(IMLOC)%RPNMS(JGL,JI) = F%RPNM(ISL+JGL-1,D%NPMS(IM)+IS+(JI-1)*2) + ENDDO + ENDDO + END DO + !$OMP END PARALLEL DO + CALL GSTATS(1251,1) + + ENDIF + + ENDDO ! +++++++++++++++++++++ END JMLOC LOOP +++++++++++++++++++++++ + + ENDIF + + CALL GSTATS(1801,3) + IF(S%LUSE_BELUSOV) DEALLOCATE(F%RPNM) + + IF(C%LWRITE_LEGPOL) CALL WRITE_LEGPOL + IF(C%LREAD_LEGPOL) CALL READ_LEGPOL + + +ENDIF +CALL GSTATS(1801,1) +CALL GSTATS(140,1) + +! ------------------------------------------------------------------ +9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) + +CALL END_POL + +END SUBROUTINE SULEG +END MODULE SULEG_MOD diff --git a/src/trans/gpu/internal/sump_trans0_mod.F90 b/src/trans/gpu/internal/sump_trans0_mod.F90 new file mode 100755 index 00000000..e61269e7 --- /dev/null +++ b/src/trans/gpu/internal/sump_trans0_mod.F90 @@ -0,0 +1,115 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUMP_TRANS0_MOD +CONTAINS +SUBROUTINE SUMP_TRANS0 + +! Set up distributed environment for the transform package (part 0) + +USE PARKIND1 ,ONLY : JPIM +USE MPL_MODULE ,ONLY : MPL_GROUPS_CREATE, MPL_MYRANK, MPL_NPROC + +USE TPM_GEN ,ONLY : NOUT, LMPOFF, NPRINTLEV +USE TPM_DISTR ,ONLY : LEQ_REGIONS, MTAGDISTGP, MTAGDISTSP, MTAGGL, & + & MTAGLETR, MTAGLG, MTAGLM, MTAGML, MTAGPART, & + & MYSETV, MYSETW, NPRCIDS, & + & NPRGPEW, NPRGPNS, NPRTRNS, NPRTRV, NPRTRW, & + & MYPROC, NPROC + +USE EQ_REGIONS_MOD ,ONLY : EQ_REGIONS, MY_REGION_EW, MY_REGION_NS, & + & N_REGIONS, N_REGIONS_EW, N_REGIONS_NS +USE PE2SET_MOD ,ONLY : PE2SET +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + +IMPLICIT NONE + +LOGICAL :: LLP1,LLP2 +INTEGER(KIND=JPIM) :: IPROC,JJ + +! ------------------------------------------------------------------ + +LLP1 = NPRINTLEV>0 +LLP2 = NPRINTLEV>1 +IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SUMP_TRANS0 ===' + + +NPROC = NPRGPNS*NPRGPEW +NPRTRNS = NPRTRW +IF(MOD(NPROC,NPRTRW) /= 0 .OR. NPRTRW > NPROC) THEN + CALL ABORT_TRANS('SUMP_TRANS0: NPROC INCONSISTENT WITH NPRTRW') +ENDIF +NPRTRV = NPROC/NPRTRW +IF(LLP1) WRITE(NOUT,*)'NPROC =',NPROC,' NPRGPNS=',NPRGPNS,' NPRGPEW=',& + & NPRGPEW,' NPRTRW=',NPRTRW,' NPRTRV=',NPRTRV + +IF(NPROC > 1 ) THEN + IPROC = MPL_NPROC() + IF(IPROC /= NPROC) THEN + WRITE(NOUT,*) 'SUMP_TRANS0: NPROC=',NPROC,' BUT MPL_NPROC RETURNS',& + & IPROC + CALL ABORT_TRANS('SUMP_TRANS0: NPROC INCONSISTENT WITH MPL_NPROC') + ENDIF + MYPROC = MPL_MYRANK() +ELSE + MYPROC = 1 +ENDIF + +IF (MYPROC > NPROC) THEN + CALL ABORT_TRANS('SUMP_TRANS0: INCONSISTENCY IN NUMBER OF PROCESSORS USED') +ENDIF + +IF( LEQ_REGIONS )THEN + ALLOCATE(N_REGIONS(NPROC+2)) + N_REGIONS(:)=0 + CALL EQ_REGIONS(NPROC) +ELSE + N_REGIONS_NS=NPRGPNS + ALLOCATE(N_REGIONS(N_REGIONS_NS)) + N_REGIONS(:)=NPRGPEW + N_REGIONS_EW=NPRGPEW +ENDIF +CALL PE2SET(MYPROC,MY_REGION_NS,MY_REGION_EW,MYSETW,MYSETV) +IF(LLP1) WRITE(NOUT,*)'MYPROC=',MYPROC,'MY_REGION_NS =',MY_REGION_NS,& + & ' MY_REGION_EW=',MY_REGION_EW,' MYSETW=',MYSETW,' MYSETV=',MYSETV + + +ALLOCATE(NPRCIDS(NPROC)) +IF(LLP2)WRITE(NOUT,9) 'NPRCIDS ',SIZE(NPRCIDS ),SHAPE(NPRCIDS ) +DO JJ=1,NPROC + NPRCIDS(JJ) = JJ +ENDDO + +! Message passing tags + +MTAGLETR = 18000 +MTAGML = 19000 +MTAGLG = 20000 +MTAGPART = 21000 +MTAGDISTSP = 22000 +MTAGGL = 23000 +MTAGLM = 24000 +MTAGDISTGP = 25000 + +! Create communicators for MPI groups + +IF (.NOT.LMPOFF) THEN + CALL MPL_GROUPS_CREATE(NPRTRW, NPRTRV) +ENDIF + +! Setup labels for timing package (gstats) + +! CF ifs/utility GSTATS_OUTPUT_IFS + +! ------------------------------------------------------------------ +9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) + +END SUBROUTINE SUMP_TRANS0 +END MODULE SUMP_TRANS0_MOD diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 new file mode 100755 index 00000000..64c907f4 --- /dev/null +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -0,0 +1,299 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUMP_TRANS_MOD +CONTAINS +SUBROUTINE SUMP_TRANS + +! Set up distributed environment for the transform package (part 2) + +! Modifications : +! P.Marguinaud : 11-Sep-2012 : Fix twice allocated pointer + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT, JPRD + +USE TPM_GEN ,ONLY : NOUT, NPRINTLEV +USE TPM_DIM ,ONLY : R +USE TPM_GEOMETRY ,ONLY : G +USE TPM_DISTR ,ONLY : D, LEQ_REGIONS, MYSETW, NPRTRNS, NPRTRW, NPROC, MYPROC + +!USE SUWAVEDI_MOD +!USE PE2SET_MOD +USE SUMPLATF_MOD ,ONLY : SUMPLATF +USE SUMPLAT_MOD ,ONLY : SUMPLAT +USE SUSTAONL_MOD ,ONLY : SUSTAONL +USE MYSENDSET_MOD ,ONLY : MYSENDSET +USE MYRECVSET_MOD ,ONLY : MYRECVSET +USE EQ_REGIONS_MOD ,ONLY : MY_REGION_NS, MY_REGION_EW, & + & N_REGIONS, N_REGIONS_EW, N_REGIONS_NS +! + +IMPLICIT NONE + +INTEGER(KIND=JPIM) :: JM +INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM +INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,OFFSET1,OFFSET2,KMLOC,KM +INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) + +REAL(KIND=JPRBT),ALLOCATABLE :: ZDUM(:) +REAL(KIND=JPRBT) :: ZMEDIAP +REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 + +LOGICAL :: LLP1,LLP2 + +! ------------------------------------------------------------------ + + +LLP1 = NPRINTLEV>0 +LLP2 = NPRINTLEV>1 +IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SUMP_TRANS ===' + +IF(.NOT.D%LGRIDONLY) THEN + ALLOCATE(D%NULTPP(NPRTRNS)) + IF(LLP2)WRITE(NOUT,9) 'D%NULTPP ',SIZE(D%NULTPP ),SHAPE(D%NULTPP ) + ALLOCATE(D%NPTRLS(NPRTRNS)) + IF(LLP2)WRITE(NOUT,9) 'D%NPTRLS ',SIZE(D%NPTRLS ),SHAPE(D%NPTRLS ) + ALLOCATE(D%NPROCL(R%NDGL)) + IF(LLP2)WRITE(NOUT,9) 'D%NPROCL ',SIZE(D%NPROCL ),SHAPE(D%NPROCL ) + + CALL SUMPLATF(R%NDGL,NPRTRNS,MYSETW,D%NULTPP,D%NPROCL,D%NPTRLS) + D%NDGL_FS = D%NULTPP(MYSETW) + +! Help arrays for spectral to fourier space transposition + ALLOCATE(D%NLTSGTB (NPRTRNS+1)) + IF(LLP2)WRITE(NOUT,9) 'D%NLTSGTB ',SIZE(D%NLTSGTB),SHAPE(D%NLTSGTB) + ALLOCATE(D%NLTSFTB (NPRTRNS+1)) + IF(LLP2)WRITE(NOUT,9) 'D%NLTSFTB ',SIZE(D%NLTSFTB),SHAPE(D%NLTSFTB) + ALLOCATE(D%NSTAGT0B(NPRTRNS+1)) + IF(LLP2)WRITE(NOUT,9) 'D%NSTAGT0B ',SIZE(D%NSTAGT0B),SHAPE(D%NSTAGT0B) + ALLOCATE(D%NSTAGT1B(NPRTRNS+1)) + IF(LLP2)WRITE(NOUT,9) 'D%NSTAGT1B ',SIZE(D%NSTAGT1B),SHAPE(D%NSTAGT1B) + ALLOCATE(D%MSTABF (NPRTRNS+1)) + IF(LLP2)WRITE(NOUT,9) 'D%MSTABF ',SIZE(D%MSTABF),SHAPE(D%MSTABF) + + D%NLTSGTB(:) = 0 + DO JGL=1,D%NDGL_FS + IGL = D%NPTRLS(MYSETW)+JGL-1 + DO JM=0,G%NMEN(IGL) + D%NLTSGTB(D%NPROCM(JM)) = D%NLTSGTB(D%NPROCM(JM))+1 + ENDDO + ENDDO + DO JA=1,NPRTRW + IPLAT = 0 + DO JGL=1,D%NULTPP(JA) + IGL = D%NPTRLS(JA)+JGL-1 + DO JM=1,D%NUMP + IF(IGL > R%NDGNH-G%NDGLU(D%MYMS(JM)) .AND. IGL <= R%NDGNH+G%NDGLU(D%MYMS(JM))) THEN + IPLAT = IPLAT + 1 + ENDIF + ENDDO + ENDDO + D%NLTSFTB(JA) = IPLAT + ENDDO + + DO JA=1,NPRTRW-1 + ISENDSET = MYSENDSET(NPRTRW,MYSETW,JA) + IRECVSET = MYRECVSET(NPRTRW,MYSETW,JA) + D%MSTABF(IRECVSET) = ISENDSET + ENDDO + D%MSTABF(MYSETW) = MYSETW + + ALLOCATE(D%NPNTGTB0(0:R%NSMAX,D%NDGL_FS)) + IF(LLP2)WRITE(NOUT,9) 'D%NPNTGTB0 ',SIZE(D%NPNTGTB0 ),SHAPE(D%NPNTGTB0 ) + ALLOCATE(D%NPNTGTB1(D%NUMP,R%NDGL)) + IF(LLP2)WRITE(NOUT,9) 'D%NPNTGTB1 ',SIZE(D%NPNTGTB1 ),SHAPE(D%NPNTGTB1 ) + + ! Global offsets of processors + D%NSTAGT0B(1) = 0 + D%NSTAGT1B(1) = 0 + DO JA=2,NPRTRNS + D%NSTAGT0B(JA) = D%NSTAGT0B(JA-1)+D%NLTSGTB(JA-1) + D%NSTAGT1B(JA) = D%NSTAGT1B(JA-1)+D%NLTSFTB(JA-1) + ENDDO + + ! Global size of foubuf + D%NLENGT0B = D%NSTAGT0B(NPRTRNS)+D%NLTSGTB(NPRTRNS) + D%NLENGT1B = D%NSTAGT1B(NPRTRNS)+D%NLTSFTB(NPRTRNS) + + ! Global offsets of grid points + DO JA=1,NPRTRW + IPOS = 0 + DO JGL=1,D%NULTPP(MYSETW) + IGL = D%NPTRLS(MYSETW) + JGL - 1 + DO JML=D%NPTRMS(JA),D%NPTRMS(JA)+D%NUMPP(JA)-1 + IM = D%NALLMS(JML) + IF (IM <= G%NMEN(IGL)) THEN + D%NPNTGTB0(IM,JGL) = D%NSTAGT0B(D%NPROCM(IM)) + IPOS + IPOS = IPOS+1 + ELSE + D%NPNTGTB0(IM,JGL) = -99 + ENDIF + ENDDO + ENDDO + ENDDO + + DO JA=1,NPRTRW + IPOS = 0 + DO JGL=1,D%NULTPP(JA) + IGL = D%NPTRLS(JA) + JGL - 1 + DO JM=1,D%NUMP + IM = D%MYMS(JM) + IF (IM <= G%NMEN(IGL)) THEN + D%NPNTGTB1(JM,IGL) = D%NSTAGT1B(D%NPROCL(IGL)) + IPOS + IPOS = IPOS+1 + ELSE + D%NPNTGTB1(JM,IGL) = -99 + ENDIF + ENDDO + ENDDO + ENDDO + + ! D%NSTAGT0B / D%NSTAGT1B: offset of peer rank in send/recv buffer + ! D%NLTSGTB / D%NLTSFTB : size of peer rank in send/recv buffer + ! D%NPNTGTB0 / D%NPNTGTB1: translation inp to global send buffer / recv to out buffer +ENDIF + +! GRIDPOINT SPACE + +ALLOCATE(D%NFRSTLAT(N_REGIONS_NS)) +IF(LLP2)WRITE(NOUT,9) 'D%NFRSTLAT ',SIZE(D%NFRSTLAT ),SHAPE(D%NFRSTLAT ) +ALLOCATE(D%NLSTLAT(N_REGIONS_NS)) +IF(LLP2)WRITE(NOUT,9) 'D%NLSTLAT ',SIZE(D%NLSTLAT ),SHAPE(D%NLSTLAT ) +ALLOCATE(D%NPTRLAT(R%NDGL)) +IF(LLP2)WRITE(NOUT,9) 'D%NPTRLAT ',SIZE(D%NPTRLAT ),SHAPE(D%NPTRLAT ) +ALLOCATE(D%NPTRFRSTLAT(N_REGIONS_NS)) +IF(LLP2)WRITE(NOUT,9) 'D%NPTRFRSTLAT',SIZE(D%NPTRFRSTLAT),SHAPE(D%NPTRFRSTLAT) +ALLOCATE(D%NPTRLSTLAT(N_REGIONS_NS)) +IF(LLP2)WRITE(NOUT,9)'D%NPTRLSTLAT',SIZE(D%NPTRLSTLAT),SHAPE(D%NPTRLSTLAT) +ALLOCATE(D%LSPLITLAT(R%NDGL)) +IF(LLP2)WRITE(NOUT,9) 'D%LSPLITLAT',SIZE(D%LSPLITLAT),SHAPE(D%LSPLITLAT) +ALLOCATE(D%NPROCA_GP(N_REGIONS_NS)) +IF(LLP2)WRITE(NOUT,9) 'D%NPROCA_GP',SIZE(D%NPROCA_GP),SHAPE(D%NPROCA_GP) + + +IF(.NOT.D%LWEIGHTED_DISTR) THEN + ALLOCATE(ZDUM(1)) + CALL SUMPLAT(R%NDGL,NPROC,N_REGIONS_NS,MY_REGION_NS,D%LSPLIT,LEQ_REGIONS,& + &D%NFRSTLAT,D%NLSTLAT,D%NFRSTLOFF,D%NPTRLAT,& + &D%NPTRFRSTLAT,D%NPTRLSTLAT,D%NPTRFLOFF,& + &ZDUM,D%LWEIGHTED_DISTR,ZMEDIAP,D%NPROCA_GP,& + &IMEDIAP,IRESTM,D%LSPLITLAT,MYPROC,G%NLOEN) +ELSE + CALL SUMPLAT(R%NDGL,NPROC,N_REGIONS_NS,MY_REGION_NS,D%LSPLIT,LEQ_REGIONS,& + &D%NFRSTLAT,D%NLSTLAT,D%NFRSTLOFF,D%NPTRLAT,& + &D%NPTRFRSTLAT,D%NPTRLSTLAT,D%NPTRFLOFF,& + &D%RWEIGHT,D%LWEIGHTED_DISTR,ZMEDIAP,D%NPROCA_GP,& + &IMEDIAP,IRESTM,D%LSPLITLAT,MYPROC,G%NLOEN) +ENDIF +D%NDGL_GP = D%NLSTLAT(MY_REGION_NS)-D%NFRSTLOFF + +IF (LLP1) THEN + IF(.NOT.D%LGRIDONLY) THEN + WRITE(NOUT,FMT='(/'' OUTPUT FROM ROUTINE SUMPLAT: ''/)') + WRITE(NOUT,FMT='('' D%NULTPP '')') + WRITE(NOUT,FMT='(20(1X,I4))') D%NULTPP(1:NPRTRNS) + WRITE(NOUT,FMT='('' D%NPROCL '')') + WRITE(NOUT,FMT='(20(1X,I4))') D%NPROCL(1:R%NDGL) + ENDIF + WRITE(NOUT,FMT='('' D%NFRSTLAT '')') + WRITE(NOUT,FMT='(20(1X,I4))') D%NFRSTLAT(1:N_REGIONS_NS) + WRITE(NOUT,FMT='('' D%NLSTLAT '')') + WRITE(NOUT,FMT='(20(1X,I4))') D%NLSTLAT(1:N_REGIONS_NS) + WRITE(NOUT,FMT='('' D%NFRSTLOFF D%NPTRFLOFF '')') + WRITE(NOUT,FMT='(2(1X,I6))') D%NFRSTLOFF, D%NPTRFLOFF + WRITE(NOUT,FMT='('' D%NPTRLAT '')') + WRITE(NOUT,FMT='(20(1X,I4))') D%NPTRLAT(1:R%NDGL) + WRITE(NOUT,FMT='('' D%LSPLITLAT '')') + WRITE(NOUT,FMT='(50(1X,L1))') D%LSPLITLAT(1:R%NDGL) + WRITE(NOUT,FMT='('' D%NPTRFRSTLAT '')') + WRITE(NOUT,FMT='(20(1X,I4))') D%NPTRFRSTLAT(1:N_REGIONS_NS) + WRITE(NOUT,FMT='('' D%NPTRLSTLAT '')') + WRITE(NOUT,FMT='(20(1X,I4))') D%NPTRLSTLAT(1:N_REGIONS_NS) + WRITE(NOUT,FMT='(/)') +ENDIF +ALLOCATE(D%NSTA(R%NDGL+N_REGIONS_NS-1,N_REGIONS_EW)) +IF(LLP2)WRITE(NOUT,9) 'D%NSTA ',SIZE(D%NSTA ),SHAPE(D%NSTA ) +ALLOCATE(D%NONL(R%NDGL+N_REGIONS_NS-1,N_REGIONS_EW)) +IF(LLP2)WRITE(NOUT,9) 'D%NONL ',SIZE(D%NONL ),SHAPE(D%NONL ) + +IF(.NOT.D%LWEIGHTED_DISTR) THEN + CALL SUSTAONL(IMEDIAP,IRESTM,D%LWEIGHTED_DISTR,ZDUM,ZMEDIAP,D%NPROCA_GP) +ELSE + CALL SUSTAONL(IMEDIAP,IRESTM,D%LWEIGHTED_DISTR,D%RWEIGHT,ZMEDIAP,D%NPROCA_GP) +ENDIF +! IGPTOTL is the number of grid points in each individual processor +ALLOCATE(IGPTOTL(N_REGIONS_NS,N_REGIONS_EW)) +IGPTOTL(:,:)=0 + +DO JA=1,N_REGIONS_NS + DO JB=1,N_REGIONS(JA) + IGPTOT = 0 + DO JGL=D%NPTRFRSTLAT(JA),D%NPTRLSTLAT(JA) + IGPTOT = IGPTOT+D%NONL(JGL,JB) + ENDDO + IGPTOTL(JA,JB) = IGPTOT + ENDDO +ENDDO +D%NGPTOT = IGPTOTL(MY_REGION_NS,MY_REGION_EW) +D%NGPTOTMX = MAXVAL(IGPTOTL) +D%NGPTOTG = SUM(IGPTOTL) +ALLOCATE(D%NGPTOTL(N_REGIONS_NS,N_REGIONS_EW)) +IF(LLP2)WRITE(NOUT,9) 'D%NGPTOTL ',SIZE(D%NGPTOTL ),SHAPE(D%NGPTOTL ) +D%NGPTOTL(:,:) = IGPTOTL(:,:) + +IF(.NOT.D%LGRIDONLY) THEN + ALLOCATE(D%NSTAGTF(D%NDGL_FS+1)) + IF(LLP2)WRITE(NOUT,9) 'D%NSTAGTF ',SIZE(D%NSTAGTF ),SHAPE(D%NSTAGTF ) + IOFF = 0 + DO JGL=1,D%NDGL_FS + D%NSTAGTF(JGL) = IOFF + IGL = D%NPTRLS(MYSETW) + JGL - 1 + ! Each latitude should be able to store NLON real values, or floor(NLON/2)+1 + ! complex values. Note that IOFF should always be even, because we need to + ! store complex values (i.e. 2 floats), but this is the case anyway. + ! WARNING: Extra padding changes results, potentially, though it does not + ! cause wrong results. + IOFF = IOFF + (G%NLOEN(IGL)/2+1)*2 + ENDDO + D%NSTAGTF(D%NDGL_FS+1) = IOFF + D%NLENGTF = IOFF +ENDIF + +IF(ALLOCATED(ZDUM)) DEALLOCATE(ZDUM) +DEALLOCATE(IGPTOTL) + +ALLOCATE(D%OFFSETS_GEMM1(D%NUMP+1)) +ALLOCATE(D%OFFSETS_GEMM2(D%NUMP+1)) + +OFFSET1 = 0 +OFFSET2 = 0 +DO KMLOC=1,D%NUMP + KM = D%MYMS(KMLOC) + D%OFFSETS_GEMM1(KMLOC) = OFFSET1 + D%OFFSETS_GEMM2(KMLOC) = OFFSET2 + + !KM=0 is transformed in double precision, no need to store here + IF (KM /= 0) THEN + OFFSET1 = OFFSET1 + ALIGN(G%NDGLU(KM),8) + ! N_OFFSET takes the max of the two GEMMs + OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8) + ENDIF +ENDDO +D%OFFSETS_GEMM1(D%NUMP+1) = OFFSET1 +D%OFFSETS_GEMM2(D%NUMP+1) = OFFSET2 + +! ------------------------------------------------------------------ +9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) + +END SUBROUTINE SUMP_TRANS +END MODULE SUMP_TRANS_MOD + diff --git a/src/trans/gpu/internal/sump_trans_preleg_mod.F90 b/src/trans/gpu/internal/sump_trans_preleg_mod.F90 new file mode 100755 index 00000000..78038f4e --- /dev/null +++ b/src/trans/gpu/internal/sump_trans_preleg_mod.F90 @@ -0,0 +1,149 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUMP_TRANS_PRELEG_MOD +CONTAINS +SUBROUTINE SUMP_TRANS_PRELEG + +! Set up distributed environment for the transform package (part 1) + +USE PARKIND1 ,ONLY : JPIM + +USE TPM_GEN ,ONLY : NOUT, NPRINTLEV +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D, NPRTRW, NPRTRV, MYSETW + +USE SUWAVEDI_MOD ,ONLY : SUWAVEDI +!USE ABORT_TRANS_MOD +! + +IMPLICIT NONE + +INTEGER(KIND=JPIM) :: JW,JV,JJ,JM,JMLOC,ILATPP,IRESTL,IMLOC,IDT,INM,ILAST + +INTEGER(KIND=JPIM) :: IMYMS(R%NSMAX+1),INUMTPP(NPRTRW) +INTEGER(KIND=JPIM) :: IDUMI1,IDUMI2,IDUMI3 +INTEGER(KIND=JPIM) :: IDUM2(0:R%NSMAX), IDUM3(NPRTRW+1), IDUM4(R%NSMAX+1) + +LOGICAL :: LLP1,LLP2 + +! ------------------------------------------------------------------ + +IF(.NOT.D%LGRIDONLY) THEN + +LLP1 = NPRINTLEV>0 +LLP2 = NPRINTLEV>1 +IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SUMP_TRANS_PRELEG ===' + +!* 1. Initialize partitioning of wave numbers to PEs ! +! ---------------------------------------------- + + ALLOCATE(D%NASM0(0:R%NSMAX)) + IF(LLP2)WRITE(NOUT,9) 'D%NASM0 ',SIZE(D%NASM0 ),SHAPE(D%NASM0 ) + ALLOCATE(D%NATM0(0:R%NTMAX)) + IF(LLP2)WRITE(NOUT,9) 'D%NATM0 ',SIZE(D%NATM0 ),SHAPE(D%NATM0 ) + ALLOCATE(D%NUMPP(NPRTRW)) + IF(LLP2)WRITE(NOUT,9) 'D%NUMPP ',SIZE(D%NUMPP ),SHAPE(D%NUMPP ) + ALLOCATE(D%NPOSSP(NPRTRW+1)) + IF(LLP2)WRITE(NOUT,9) 'D%NPOSSP',SIZE(D%NPOSSP ),SHAPE(D%NPOSSP ) + ALLOCATE(D%NPROCM(0:R%NSMAX)) + IF(LLP2)WRITE(NOUT,9) 'D%NPROCM',SIZE(D%NPROCM ),SHAPE(D%NPROCM ) + ALLOCATE(D%NPTRMS(NPRTRW)) + IF(LLP2)WRITE(NOUT,9) 'D%NPTRMS ',SIZE(D%NPTRMS ),SHAPE(D%NPTRMS ) + ALLOCATE(D%NALLMS(R%NSMAX+1)) + IF(LLP2)WRITE(NOUT,9) 'D%NALLMS ',SIZE(D%NALLMS ),SHAPE(D%NALLMS ) + ALLOCATE(D%NDIM0G(0:R%NSMAX)) + IF(LLP2)WRITE(NOUT,9) 'D%NDIM0G ',SIZE(D%NDIM0G ),SHAPE(D%NDIM0G ) + + CALL SUWAVEDI(R%NSMAX,R%NTMAX,NPRTRW,MYSETW,& + &D%NASM0,D%NSPOLEGL,D%NPROCM,D%NUMPP,& + &D%NSPEC,D%NSPEC2,D%NSPEC2MX,D%NPOSSP,IMYMS,& + &D%NPTRMS,D%NALLMS,D%NDIM0G) + CALL SUWAVEDI(R%NTMAX,R%NTMAX,NPRTRW,MYSETW,& + &KASM0=D%NATM0,KUMPP=INUMTPP,KSPEC2=D%NTPEC2) + + D%NUMP = D%NUMPP (MYSETW) + ALLOCATE(D%MYMS(D%NUMP)) + IF(LLP2)WRITE(NOUT,9) 'D%MYMS ',SIZE(D%MYMS ),SHAPE(D%MYMS ) + D%MYMS(:) = IMYMS(1:D%NUMP) + D%NUMTP = INUMTPP(MYSETW) + ALLOCATE(D%NLATLS(NPRTRW,NPRTRV)) + IF(LLP2)WRITE(NOUT,9) 'D%NLATLS',SIZE(D%NLATLS ),SHAPE(D%NLATLS ) + ALLOCATE(D%NLATLE(NPRTRW,NPRTRV)) + IF(LLP2)WRITE(NOUT,9) 'D%NLATLE',SIZE(D%NLATLE ),SHAPE(D%NLATLE ) + + D%NLATLS(:,:) = 999999 + D%NLATLE(:,:) = -1 + + ILATPP = R%NDGNH/NPRTRW + IRESTL = R%NDGNH-NPRTRW*ILATPP + DO JW=1,NPRTRW + IF (JW > IRESTL) THEN + D%NLATLS(JW,1) = IRESTL*(ILATPP+1)+(JW-IRESTL-1)*ILATPP+1 + D%NLATLE(JW,1) = D%NLATLS(JW,1)+ILATPP-1 + ELSE + D%NLATLS(JW,1) = (JW-1)*(ILATPP+1)+1 + D%NLATLE(JW,1) = D%NLATLS(JW,1)+ILATPP + ENDIF + ENDDO + ILAST=0 + DO JW=1,NPRTRW + ILATPP = (D%NLATLE(JW,1)-D%NLATLS(JW,1)+1)/NPRTRV + IRESTL = (D%NLATLE(JW,1)-D%NLATLS(JW,1)+1)-NPRTRV*ILATPP + DO JV=1,NPRTRV + IF (JV > IRESTL) THEN + D%NLATLS(JW,JV) = IRESTL*(ILATPP+1)+(JV-IRESTL-1)*ILATPP+1+ILAST + D%NLATLE(JW,JV) = D%NLATLS(JW,JV)+ILATPP-1 + ELSE + D%NLATLS(JW,JV) = (JV-1)*(ILATPP+1)+1+ILAST + D%NLATLE(JW,JV) = D%NLATLS(JW,JV)+ILATPP + ENDIF + ENDDO + ILAST=D%NLATLE(JW,NPRTRV) + ENDDO + + IF (LLP1) THEN + DO JW=1,NPRTRW + DO JV=1,NPRTRV + WRITE(NOUT,'(" JW=",I6," JV=",I6," D%NLATLS=",I6," D%NLATLE=",I6)')& + & JW,JV,D%NLATLS(JW,JV),D%NLATLE(JW,JV) + ENDDO + ENDDO + ENDIF + + ALLOCATE(D%NPMT(0:R%NSMAX)) + IF(LLP2)WRITE(NOUT,9) 'D%NPMT ',SIZE(D%NPMT ),SHAPE(D%NPMT ) + ALLOCATE(D%NPMS(0:R%NSMAX)) + IF(LLP2)WRITE(NOUT,9) 'D%NPMS ',SIZE(D%NPMS ),SHAPE(D%NPMS ) + ALLOCATE(D%NPMG(0:R%NSMAX)) + IF(LLP2)WRITE(NOUT,9) 'D%NPMG ',SIZE(D%NPMG ),SHAPE(D%NPMG ) + IDT = R%NTMAX-R%NSMAX + INM = 0 + DO JMLOC=1,D%NUMP + IMLOC = D%MYMS(JMLOC) + D%NPMT(IMLOC) = INM + D%NPMS(IMLOC) = INM+IDT + INM = INM+R%NTMAX+2-IMLOC + ENDDO + INM = 0 + DO JM=0,R%NSMAX + D%NPMG(JM) = INM + INM = INM+R%NTMAX+2-JM + ENDDO + + D%NLEI3D = (R%NLEI3-1)/NPRTRW+1 + +ENDIF + +! ------------------------------------------------------------------ +9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) + +END SUBROUTINE SUMP_TRANS_PRELEG +END MODULE SUMP_TRANS_PRELEG_MOD diff --git a/src/trans/gpu/internal/sumplat_mod.F90 b/src/trans/gpu/internal/sumplat_mod.F90 new file mode 100755 index 00000000..effffa23 --- /dev/null +++ b/src/trans/gpu/internal/sumplat_mod.F90 @@ -0,0 +1,256 @@ +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUMPLAT_MOD +CONTAINS +SUBROUTINE SUMPLAT(KDGL,KPROC,KPROCA,KMYSETA,LDSPLIT,LDEQ_REGIONS,& + & KFRSTLAT,KLSTLAT,KFRSTLOFF,KPTRLAT,& + & KPTRFRSTLAT,KPTRLSTLAT,KPTRFLOFF,& + & PWEIGHT,LDWEIGHTED_DISTR,PMEDIAP,KPROCAGP,& + & KMEDIAP,KRESTM,LDSPLITLAT,KMYPROC,KLOEN) + +!**** *SUMPLAT * - Initialize gridpoint distrbution in N-S direction + +! Purpose. +! -------- + + +!** Interface. +! ---------- +! *CALL* *SUMPLAT * + +! Explicit arguments - input : +! -------------------- +! KDGL -last latitude +! KPROC -total number of processors +! KPROCA -number of processors in A direction +! KMYSETA -process number in A direction +! LDSPLIT -true for latitudes shared between sets +! LDEQ_REGIONS -true if eq_regions partitioning +! PWEIGHT -weight per grid-point if weighted distribution +! LDWEIGHTED_DISTR -true if weighted distribution + +! Explicit arguments - output: +! -------------------- +! PMEDIAP -mean weight per PE if weighted distribution +! KMEDIAP -mean number of grid points per PE +! KPROCAGP -number of grid points per A set +! KRESTM -number of PEs with one extra point +! KFRSTLAT -first latitude row on processor +! KLSTLAT -last latitude row on processor +! KFRSTLOFF -offset for first latitude in set +! KPTRLAT -pointer to start of latitude +! KPTRFRSTLAT-pointer to first latitude +! KPTRLSTLAT -pointer to last latitude +! KPTRFLOFF -offset for pointer to first latitude +! LDSPLITLAT -true for latitudes which are split + +! Implicit arguments : +! -------------------- + + +! Method. +! ------- +! See documentation + +! Externals. SUMPLATB and SUEMPLATB. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! MPP Group *ECMWF* + +! Modifications. +! -------------- +! Original : 95-10-01 +! David Dent:97-06-02 parameters KFRSTLAT etc added +! JF. Estrade:97-11-13 Adaptation to ALADIN case +! J.Boutahar: 98-07-06 phasing with CY19 +! Modified 98-08-10 by K. YESSAD: removal of LRPOLE option + cleanings +! (correct computation of extrapolar latitudes for KPROCL). +! Modified 98-12-07 by K. YESSAD and C. FISCHER: cleaning. +! - merge old sumplat.F and suemplat.F +! - gather 'lelam' code and 'not lelam' code. +! - clean (useless duplication of variables, non doctor features). +! - remodularise according to lelam/not lelam +! -> lelam features in new routine suemplatb.F, +! not lelam features in new routine sumplatb.F +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +USE TPM_GEOMETRY ,ONLY : G +USE TPM_DISTR ,ONLY : MYPROC + +USE SUMPLATB_MOD ,ONLY : SUMPLATB +USE SUMPLATBEQ_MOD ,ONLY : SUMPLATBEQ +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE + + +! * DUMMY: +REAL(KIND=JPRBT),INTENT(OUT) :: PMEDIAP +INTEGER(KIND=JPIM),INTENT(OUT) :: KMEDIAP +INTEGER(KIND=JPIM),INTENT(OUT) :: KRESTM +INTEGER(KIND=JPIM),INTENT(IN) :: KDGL +INTEGER(KIND=JPIM),INTENT(IN) :: KPROC +INTEGER(KIND=JPIM),INTENT(IN) :: KPROCA +INTEGER(KIND=JPIM),INTENT(IN) :: KMYSETA +REAL(KIND=JPRBT),INTENT(IN) :: PWEIGHT(:) +LOGICAL,INTENT(INOUT) :: LDWEIGHTED_DISTR +INTEGER(KIND=JPIM),INTENT(OUT) :: KFRSTLAT(:) +INTEGER(KIND=JPIM),INTENT(OUT) :: KLSTLAT(:) +INTEGER(KIND=JPIM),INTENT(OUT) :: KFRSTLOFF +INTEGER(KIND=JPIM),INTENT(OUT) :: KPTRLAT(:) +INTEGER(KIND=JPIM),INTENT(OUT) :: KPTRFRSTLAT(:) +INTEGER(KIND=JPIM),INTENT(OUT) :: KPTRLSTLAT(:) +INTEGER(KIND=JPIM),INTENT(OUT) :: KPTRFLOFF +INTEGER(KIND=JPIM),INTENT(OUT) :: KPROCAGP(KPROCA) +LOGICAL,INTENT(IN) :: LDSPLIT +LOGICAL,INTENT(IN) :: LDEQ_REGIONS +LOGICAL,INTENT(OUT) :: LDSPLITLAT(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KMYPROC +INTEGER(KIND=JPIM),INTENT(IN) :: KLOEN(KDGL) + +! * LOCAL: +! === END OF INTERFACE BLOCK === +INTEGER(KIND=JPIM) :: INDIC(KPROCA),ILAST(KPROCA) + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: IPTRLATITUDE, JA, JGL + +LOGICAL :: LLFOURIER +LOGICAL :: LLDEBUG=.FALSE. + +! ----------------------------------------------------------------- + +!* 1. CODE DEPENDING ON 'LELAM': COMPUTATION OF +! KMEDIAP, KRESTM, INDIC, ILAST. +! ----------------------------------------- +INDIC(:)=0 +ILAST(:)=0 + +IF(LDWEIGHTED_DISTR.AND..NOT.LDEQ_REGIONS)THEN + CALL ABORT_TRANS ('SUMPLAT: LDWEIGHTED_DISTR=T AND LDEQ_REGIONS=F NOT SUPPORTED') +ENDIF + +IF( LDEQ_REGIONS )THEN + CALL SUMPLATBEQ(1,KDGL,KPROC,KPROCA,KLOEN,LDSPLIT,LDEQ_REGIONS,& + &PWEIGHT,LDWEIGHTED_DISTR,PMEDIAP,KPROCAGP,& + &KMEDIAP,KRESTM,INDIC,ILAST) +ELSE + LLFOURIER=.FALSE. + CALL SUMPLATB(1,KDGL,KPROCA,KLOEN,LDSPLIT,LLFOURIER,& + &KMEDIAP,KRESTM,INDIC,ILAST) +ENDIF + +! ----------------------------------------------------------------- + +!* 2. CODE NOT DEPENDING ON 'LELAM': COMPUTATION OF +! KFRSTLAT TO LDSPLITLAT. +! --------------------------------------------- + + +! * Computation of first and last latitude of processor sets +! ----------- in grid-point-space ----------------------- + +IF(KMYPROC==1.AND.LLDEBUG)THEN + WRITE(0,'("")') + WRITE(0,'("SUMPLAT_MOD:LDWEIGHTED_DISTR=",L1)')LDWEIGHTED_DISTR + WRITE(0,'("")') + DO JA=1,KPROCA + WRITE(0,'("SUMPLAT_MOD: JA=",I5," ILAST=",I5," INDIC=",I5)')& + &JA,ILAST(JA),INDIC(JA) + ENDDO + WRITE(0,'("")') + IF( LDEQ_REGIONS .AND. LDSPLIT )THEN + DO JA=1,KPROCA + WRITE(0,'("SUMPLAT_MOD: JA=",I5," KPROCAGP=",I12)')& + &JA,KPROCAGP(JA) + ENDDO + WRITE(0,'("")') + ENDIF +ENDIF + +KFRSTLAT(1) = 1 +KLSTLAT(KPROCA) = KDGL +DO JA=1,KPROCA-1 + IF ((.NOT. LDSPLIT) .OR. INDIC(JA) == 0) THEN + KFRSTLAT(JA+1) = ILAST(JA) + 1 + KLSTLAT(JA) = ILAST(JA) + ELSE + KFRSTLAT(JA+1) = INDIC(JA) + KLSTLAT(JA) = INDIC(JA) + ENDIF +ENDDO +KFRSTLOFF=KFRSTLAT(KMYSETA)-1 + +! * Initialise following data structures:- +! NPTRLAT (pointer to the start of each latitude) +! LSPLITLAT (TRUE if latitude is split over two A sets) +! NPTRFRSTLAT (pointer to the first latitude of each A set) +! NPTRLSTLAT (pointer to the last latitude of each A set) + +DO JGL=1,KDGL + KPTRLAT (JGL)=-999 + LDSPLITLAT(JGL)=.FALSE. +ENDDO +IPTRLATITUDE=0 +DO JA=1,KPROCA + DO JGL=KFRSTLAT(JA),KLSTLAT(JA) + IPTRLATITUDE=IPTRLATITUDE+1 + LDSPLITLAT(JGL)=.TRUE. + IF( KPTRLAT(JGL) == -999 )THEN + KPTRLAT(JGL)=IPTRLATITUDE + LDSPLITLAT(JGL)=.FALSE. + ENDIF + ENDDO +ENDDO +DO JA=1,KPROCA + IF( LDSPLITLAT(KFRSTLAT(JA)) .AND. JA /= 1)THEN + KPTRFRSTLAT(JA)=KPTRLAT(KFRSTLAT(JA))+1 + ELSE + KPTRFRSTLAT(JA)=KPTRLAT(KFRSTLAT(JA)) + ENDIF + IF( LDSPLITLAT(KLSTLAT(JA)) .AND. JA == KPROCA)THEN + KPTRLSTLAT(JA)=KPTRLAT(KLSTLAT(JA))+1 + ELSE + KPTRLSTLAT(JA)=KPTRLAT(KLSTLAT(JA)) + ENDIF +ENDDO +KPTRFLOFF=KPTRFRSTLAT(KMYSETA)-1 + +IF(KMYPROC==1.AND.LLDEBUG)THEN + DO JGL=1,KDGL + WRITE(0,'("SUMPLAT_MOD: JGL=",I5," KPTRLAT=",I5," LDSPLITLAT=",L4)')& + & JGL,KPTRLAT(JGL),LDSPLITLAT(JGL) + ENDDO + DO JA=1,KPROCA + WRITE(0,'("SUMPLAT_MOD: JA=",I5," KFRSTLAT=",I5," KLSTLAT=",I5,& + & " KPTRFRSTLAT=",I5," KPTRLSTLAT=",I5," KLSTLAT-KFRSTLAT=",I5,& + & " SUM(G%NLOEN(KFRSTLAT:KLSTLAT))=",I10)')& + & JA,KFRSTLAT(JA),KLSTLAT(JA),KPTRFRSTLAT(JA),KPTRLSTLAT(JA),& + & KLSTLAT(JA)-KFRSTLAT(JA),SUM(G%NLOEN(KFRSTLAT(JA):KLSTLAT(JA))) + ENDDO +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE SUMPLAT +END MODULE SUMPLAT_MOD + + + diff --git a/src/trans/gpu/internal/sumplatb_mod.F90 b/src/trans/gpu/internal/sumplatb_mod.F90 new file mode 100755 index 00000000..fb5033ac --- /dev/null +++ b/src/trans/gpu/internal/sumplatb_mod.F90 @@ -0,0 +1,226 @@ +! (C) Copyright 1998- ECMWF. +! (C) Copyright 1998- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUMPLATB_MOD +CONTAINS +SUBROUTINE SUMPLATB(KDGSA,KDGL,KPROCA,KLOENG,LDSPLIT,LDFOURIER,& + & KMEDIAP,KRESTM,KINDIC,KLAST) + +!**** *SUMPLATB * - Routine to initialize parallel environment + +! Purpose. +! -------- + + +!** Interface. +! ---------- +! *CALL* *SUMPLATB * + +! Explicit arguments - input : +! -------------------- +! KDGSA -first latitude (grid-space) +! (may be different from NDGSAG) +! KDGL -last latitude +! KPROCA -number of processors in A direction +! KLOENG -actual number of longitudes per latitude. +! LDSPLIT -true for latitudes shared between sets +! LDFOURIER -true for fourier space partitioning + +! Explicit arguments - output: +! -------------------- +! KMEDIAP -mean number of grid points per PE +! KRESTM -number of PEs with one extra point +! KINDIC -intermediate quantity for 'sumplat' +! KLAST -intermediate quantity for 'sumplat' + +! Implicit arguments : +! -------------------- + + +! Method. +! ------- +! See documentation + +! Externals. NONE. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! K. YESSAD (after old version of sumplat.F). + +! Modifications. +! -------------- +! Original : 98-12-07 +! G. Mozdzynski (August 2012): rewrite of fourier latitude distribution +! ------------------------------------------------------------------ + + +USE PARKIND_ECTRANS ,ONLY : JPIM, JPIB, JPRBT + +USE TPM_DISTR +USE ABORT_TRANS_MOD + +IMPLICIT NONE + + +! * DUMMY: +INTEGER(KIND=JPIM),INTENT(IN) :: KDGSA +INTEGER(KIND=JPIM),INTENT(IN) :: KDGL +INTEGER(KIND=JPIM),INTENT(IN) :: KPROCA +INTEGER(KIND=JPIM),INTENT(IN) :: KLOENG(KDGSA:KDGL) +LOGICAL,INTENT(IN) :: LDSPLIT +LOGICAL,INTENT(IN) :: LDFOURIER +INTEGER(KIND=JPIM),INTENT(OUT) :: KMEDIAP +INTEGER(KIND=JPIM),INTENT(OUT) :: KRESTM +INTEGER(KIND=JPIM),INTENT(OUT) :: KINDIC(KPROCA) +INTEGER(KIND=JPIM),INTENT(OUT) :: KLAST(KPROCA) + +! * LOCAL: +INTEGER(KIND=JPIB) :: ICOST(KDGSA:KDGL) +INTEGER(KIND=JPIM) :: ILATS(KPROCA) + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: ICOMP, IGL, JA, JGL, ILAST, IREST, IA +INTEGER(KIND=JPIM) :: ITOT_TOP, ITOT_BOT, IGL_TOP, IGL_BOT +INTEGER(KIND=JPIB) :: IMEDIA,ITOT +REAL(KIND=JPRBT) :: ZLG +LOGICAL :: LLDONE,LLSIMPLE + +! ----------------------------------------------------------------- + +!* 1. COMPUTATION OF KMEDIAP, KRESTM, KINDIC, KLAST. +! ---------------------------------------------- + +! * Computation of KMEDIAP and KRESTM. + +IF( LDFOURIER )THEN + +! DO JGL=1,KDGL +! ZLG=LOG(FLOAT(KLOENG(JGL))) +! ICOST(JGL)=KLOENG(JGL)*ZLG*SQRT(ZLG) +! ENDDO + + DO JGL=1,KDGL + ICOST(JGL)=KLOENG(JGL) + ENDDO + +ELSE + + DO JGL=1,KDGL + ICOST(JGL)=KLOENG(JGL) + ENDDO + +ENDIF + +IMEDIA = SUM(ICOST(KDGSA:KDGL)) +KMEDIAP = IMEDIA / KPROCA +KRESTM = IMEDIA - KMEDIAP * KPROCA +IF (KRESTM > 0) KMEDIAP = KMEDIAP + 1 + +! * Computation of intermediate quantities KINDIC and KLAST + +KINDIC(:)=0 +KLAST(:)=0 + +IF (LDSPLIT) THEN + + IREST = 0 + ILAST =0 + DO JA=1,KPROCA + IF (JA <= KRESTM .OR. KRESTM == 0) THEN + ICOMP = KMEDIAP + ELSE + ICOMP = KMEDIAP - 1 + ENDIF + ITOT = IREST + IGL = ILAST+1 + DO JGL=IGL,KDGL + ILAST = JGL + IF(ITOT+ICOST(JGL) < ICOMP) THEN + ITOT = ITOT+ICOST(JGL) + ELSEIF(ITOT+ICOST(JGL) == ICOMP) THEN + IREST = 0 + KLAST(JA) = JGL + KINDIC(JA) = 0 + EXIT + ELSE + IREST = ICOST(JGL) -(ICOMP-ITOT) + KLAST(JA) = JGL + KINDIC(JA) = JGL + EXIT + ENDIF + ENDDO + ENDDO + +ELSE + + ITOT_TOP=0 + ITOT_BOT=0 + IGL_TOP=1 + IGL_BOT=KDGL + DO JA=1,(KPROCA-1)/2+1 + IF( JA /= KPROCA/2+1 )THEN + LLDONE=.TRUE. + DO WHILE ( LLDONE ) + IF( ITOT_TOP+ICOST(IGL_TOP) < KMEDIAP )THEN + KLAST(JA)=IGL_TOP + ITOT_TOP=ITOT_TOP+ICOST(IGL_TOP) + IGL_TOP=IGL_TOP+1 + ELSE + ITOT_TOP=ITOT_TOP-KMEDIAP + LLDONE=.FALSE. + ENDIF + ENDDO + KLAST(KPROCA-JA+1)=IGL_BOT + LLDONE=.TRUE. + DO WHILE ( LLDONE ) + IF( ITOT_BOT+ICOST(IGL_BOT) < KMEDIAP )THEN + ITOT_BOT=ITOT_BOT+ICOST(IGL_BOT) + IGL_BOT=IGL_BOT-1 + ELSE + ITOT_BOT=ITOT_BOT-KMEDIAP + LLDONE=.FALSE. + ENDIF + ENDDO + ELSE + KLAST(JA)=IGL_BOT + ENDIF + ENDDO + + LLSIMPLE=.FALSE. + DO JA=1,KPROCA + IF( KLAST(JA)==0 )THEN + LLSIMPLE=.TRUE. + EXIT + ENDIF + ENDDO + IF( LLSIMPLE )THEN +! WRITE(0,'("SUMPLATB_MOD: REVERTING TO SIMPLE LATITUDE DISTRIBUTION")') + ILATS(:)=0 + IA=0 + DO JGL=1,KDGL + IA=IA+1 + ILATS(IA)=ILATS(IA)+1 + IF( IA==KPROCA ) IA=0 + ENDDO + KLAST(1)=ILATS(1) + DO JA=2,KPROCA + KLAST(JA)=KLAST(JA-1)+ILATS(JA) + ENDDO + ENDIF + +ENDIF + +END SUBROUTINE SUMPLATB +END MODULE SUMPLATB_MOD diff --git a/src/trans/gpu/internal/sumplatbeq_mod.F90 b/src/trans/gpu/internal/sumplatbeq_mod.F90 new file mode 100755 index 00000000..17387736 --- /dev/null +++ b/src/trans/gpu/internal/sumplatbeq_mod.F90 @@ -0,0 +1,289 @@ +! (C) Copyright 2006- ECMWF. +! (C) Copyright 2006- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUMPLATBEQ_MOD +CONTAINS +SUBROUTINE SUMPLATBEQ(KDGSA,KDGL,KPROC,KPROCA,KLOENG,LDSPLIT,LDEQ_REGIONS,& + &PWEIGHT,LDWEIGHTED_DISTR,PMEDIAP,KPROCAGP,& + &KMEDIAP,KRESTM,KINDIC,KLAST) + +!**** *SUMPLATBEQ * - Routine to initialize parallel environment +! (latitude partitioning for LEQ_REGIONS=T) + +! Purpose. +! -------- + + +!** Interface. +! ---------- +! *CALL* *SUMPLATBEQ * + +! Explicit arguments - input : +! -------------------- +! KDGSA -first latitude (grid-space) +! (may be different from NDGSAG) +! KDGL -last latitude +! KPROC -total number of processors +! KPROCA -number of processors in A direction +! KLOENG -actual number of longitudes per latitude. +! LDSPLIT -true for latitudes shared between sets +! LDEQ_REGIONS -true if eq_regions partitioning +! PWEIGHT -weight per grid-point if weighted distribution +! LDWEIGHTED_DISTR -true if weighted distribution + +! Explicit arguments - output: +! -------------------- +! PMEDIAP -mean weight per PE if weighted distribution +! KMEDIAP -mean number of grid points per PE +! KPROCAGP -number of grid points per A set +! KRESTM -number of PEs with one extra point +! KINDIC -intermediate quantity for 'sumplat' +! KLAST -intermediate quantity for 'sumplat' + +! Implicit arguments : +! -------------------- + + +! Method. +! ------- +! See documentation + +! Externals. NONE. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! G. Mozdzynski + +! Modifications. +! -------------- +! Original : April 2006 +! ------------------------------------------------------------------ + + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +USE TPM_DISTR ,ONLY : MYPROC +USE EQ_REGIONS_MOD ,ONLY : N_REGIONS +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE + + +! * DUMMY: +INTEGER(KIND=JPIM),INTENT(IN) :: KDGSA +INTEGER(KIND=JPIM),INTENT(IN) :: KDGL +INTEGER(KIND=JPIM),INTENT(IN) :: KPROC +INTEGER(KIND=JPIM),INTENT(IN) :: KPROCA +INTEGER(KIND=JPIM),INTENT(IN) :: KLOENG(KDGSA:KDGL) +REAL(KIND=JPRBT), INTENT(IN) :: PWEIGHT(:) +LOGICAL,INTENT(IN) :: LDSPLIT +LOGICAL,INTENT(IN) :: LDEQ_REGIONS +LOGICAL,INTENT(INOUT) :: LDWEIGHTED_DISTR +REAL(KIND=JPRBT), INTENT(OUT) :: PMEDIAP +INTEGER(KIND=JPIM),INTENT(OUT) :: KMEDIAP +INTEGER(KIND=JPIM),INTENT(OUT) :: KRESTM +INTEGER(KIND=JPIM),INTENT(OUT) :: KINDIC(KPROCA) +INTEGER(KIND=JPIM),INTENT(OUT) :: KLAST(KPROCA) +INTEGER(KIND=JPIM),INTENT(OUT) :: KPROCAGP(KPROCA) + +! * LOCAL: + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: ICOMP, IGL, IMAXI, IMEDIA, IMEDIAP, ITOT, JA, JB, IA, JGL,& + &ILAST,IREST,IPE,I2REGIONS,IGP +REAL(KIND=JPRBT) :: ZMEDIA, ZCOMP +LOGICAL :: LLDONE + +! ----------------------------------------------------------------- + +!* 1. COMPUTATION OF KMEDIAP, KRESTM, KINDIC, KLAST. +! ---------------------------------------------- +100 CONTINUE +! * Computation of KMEDIAP and KRESTM. + +IF (.NOT.LDWEIGHTED_DISTR) THEN + + IMEDIA = SUM(KLOENG(KDGSA:KDGL)) + KMEDIAP = IMEDIA / KPROC + + IF( KPROC > 1 )THEN +! test if KMEDIAP is too small and no more than 2 asets would be required +! for the first latitude + IF( LDSPLIT )THEN + I2REGIONS=N_REGIONS(1)+N_REGIONS(2) + IF( KMEDIAP < (KLOENG(KDGSA)-1)/I2REGIONS+1 )THEN + WRITE(0,'("SUMPLATBEQ: KMEDIAP=",I6," I2REGIONS=",I3," KLOENG(KDGSA)=",I4)')& + &KMEDIAP,I2REGIONS,KLOENG(KDGSA) + CALL ABORT_TRANS ('SUMPLATBEQ: NPROC TOO BIG FOR THIS RESOLUTION, LDSPLIT=T') + ENDIF + ELSE +! test for number asets too large for the number of latitudes + IF( KPROCA > KDGL )THEN + WRITE(0,'("SUMPLATBEQ: KMEDIAP=",I6," KPROCA=",I4," KDGL=",I4)')& + &KMEDIAP,KPROCA,KDGL + CALL ABORT_TRANS ('SUMPLATBEQ: NPROC TOO BIG FOR THIS RESOLUTION, LDSPLIT=F') + ENDIF + ENDIF + ENDIF + + KRESTM = IMEDIA - KMEDIAP * KPROC + IF (KRESTM > 0) KMEDIAP = KMEDIAP + 1 + +ELSE + + ZMEDIA = SUM(PWEIGHT(:)) + PMEDIAP = ZMEDIA / KPROC + +ENDIF + +! * Computation of intermediate quantities KINDIC and KLAST + +IF (LDSPLIT) THEN + + KPROCAGP(:)=0 + IREST = 0 + ILAST =0 + IPE=0 + ZCOMP=0 + IGP=0 + DO JA=1,KPROCA + ICOMP=0 + DO JB=1,N_REGIONS(JA) + IF( LDWEIGHTED_DISTR )THEN + DO WHILE ( ( JA == KPROCA .OR. ZCOMP < PMEDIAP ) .AND. IGP < SIZE(PWEIGHT) ) + IGP = IGP + 1 + ICOMP = ICOMP + 1 + ZCOMP = ZCOMP + PWEIGHT(IGP) + ENDDO + ZCOMP = ZCOMP - PMEDIAP + ELSE + IPE=IPE+1 + IF (IPE <= KRESTM .OR. KRESTM == 0) THEN + ICOMP = ICOMP + KMEDIAP + ELSE + ICOMP = ICOMP + (KMEDIAP-1) + ENDIF + ENDIF + ENDDO + KPROCAGP(JA)=ICOMP + ITOT = IREST + IGL = ILAST+1 + DO JGL=IGL,KDGL + ILAST = JGL + IF(ITOT+KLOENG(JGL) < ICOMP) THEN + ITOT = ITOT+KLOENG(JGL) + ELSEIF(ITOT+KLOENG(JGL) == ICOMP) THEN + IREST = 0 + KLAST(JA) = JGL + KINDIC(JA) = 0 + EXIT + ELSE + IREST = KLOENG(JGL) -(ICOMP-ITOT) + KLAST(JA) = JGL + KINDIC(JA) = JGL + EXIT + ENDIF + ENDDO + ENDDO + IF( LDWEIGHTED_DISTR )THEN + IF( KLAST(KPROCA) /= KDGL )THEN + DO JA=1,KPROCA + IF( MYPROC == 1 )THEN + WRITE(0,'("SUMPLATBEQ_MOD: JA=",I3," KLAST=",I3," KINDIC=",I3)')& + &JA,KLAST(JA),KINDIC(JA) + ENDIF + ENDDO + WRITE(0,'("SUMPLATBEQ: LWEIGHTED_DISTR=T FAILED TO PARTITION GRID, REVERTING TO ",& + & " LWEIGHTED_DISTR=F PARTITIONING")') + LDWEIGHTED_DISTR=.FALSE. + GOTO 100 + ENDIF + ENDIF + IF( SUM(KPROCAGP(:)) /= SUM(KLOENG(KDGSA:KDGL)) )THEN + IF( MYPROC == 1 )THEN + WRITE(0,'("SUM(KPROCAGP(:))=",I12)')SUM(KPROCAGP(:)) + WRITE(0,'("SUM(KLOENG(:))=",I12)')SUM(KLOENG(KDGSA:KDGL)) + ENDIF + CALL ABORT_TRANS ('SUMPLATBEQ: PROBLEM IN PARTITIONING ') + ENDIF + +ELSE + + IF( LDWEIGHTED_DISTR )THEN + CALL ABORT_TRANS ('SUMPLATBEQ: LSPLIT=F NOT SUPPORTED FOR WEIGHTED DISTRIBUTION ') + ENDIF + + KINDIC(:) = 0 + LLDONE=.FALSE. + IMEDIAP=KMEDIAP + IF( MYPROC == 1 )THEN + WRITE(0,'("SUMPLATBEQ: IMEDIAP=",I6)')IMEDIAP + ENDIF + DO WHILE(.NOT.LLDONE) +! loop until a satisfactory distribution can be found + IA=1 + IMAXI=IMEDIAP*N_REGIONS(IA) + DO JGL=1,KDGL + KLAST(IA)=JGL + IMAXI=IMAXI-KLOENG(JGL) + IF( IA == KPROCA .AND. JGL == KDGL )THEN + IF( MYPROC == 1 )THEN + WRITE(0,'("SUMPLATBEQ: EXIT 1")') + ENDIF + EXIT + ENDIF + IF( IA == KPROCA .AND. JGL < KDGL )THEN + IF( MYPROC == 1 )THEN + WRITE(0,'("SUMPLATBEQ: EXIT 2")') + ENDIF + KLAST(KPROCA)=KDGL + EXIT + ENDIF + IF( IA < KPROCA .AND. JGL == KDGL )THEN + DO JA=KPROCA,IA+1,-1 + KLAST(JA)=KDGL+JA-KPROCA + ENDDO + DO JA=KPROCA,2,-1 + IF( KLAST(JA) <= KLAST(JA-1) )THEN + KLAST(JA-1)=KLAST(JA)-1 + ENDIF + ENDDO + IF( MYPROC == 1 )THEN + WRITE(0,'("SUMPLATBEQ: EXIT 3")') + ENDIF + EXIT + ENDIF + IF( IMAXI <= 0 )THEN + IA=IA+1 + IMAXI=IMAXI+IMEDIAP*N_REGIONS(IA) + ENDIF + ENDDO + IF( KPROCA > 1 .AND. KLAST(KPROCA) == KLAST(KPROCA-1) )THEN + IMEDIAP=IMEDIAP-1 + IF( MYPROC == 1 )THEN + WRITE(0,'("SUMPLATBEQ: REDUCING IMEDIAP=",I6)')IMEDIAP + ENDIF + IF( IMEDIAP <= 0 )THEN + CALL ABORT_TRANS ('SUMPLATBEQ: PROBLEM PARTITIONING WITH LSPLIT=F, IMEDIAP <= 0') + ENDIF + ELSE + LLDONE=.TRUE. + ENDIF + ENDDO +ENDIF + +END SUBROUTINE SUMPLATBEQ +END MODULE SUMPLATBEQ_MOD diff --git a/src/trans/gpu/internal/sumplatf_mod.F90 b/src/trans/gpu/internal/sumplatf_mod.F90 new file mode 100755 index 00000000..7a5545fc --- /dev/null +++ b/src/trans/gpu/internal/sumplatf_mod.F90 @@ -0,0 +1,150 @@ +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUMPLATF_MOD +CONTAINS +SUBROUTINE SUMPLATF(KDGL,KPROCA,KMYSETA,& + &KULTPP,KPROCL,KPTRLS) + +!**** *SUMPLATF * - Initialize fourier space distibution in N-S direction + +! Purpose. +! -------- + + +!** Interface. +! ---------- +! *CALL* *SUMPLATF * + +! Explicit arguments - input : +! -------------------- +! KDGL -last latitude +! KPROCA -number of processors in A direction +! KMYSETA -process number in A direction + +! Explicit arguments - output: +! -------------------- + +! KULTPP -number of latitudes in process +! (in Fourier space) +! KPROCL -process responsible for latitude +! (in Fourier space) +! KPTRLS -pointer to first global latitude +! of process (in Fourier space) + +! Implicit arguments : +! -------------------- + + +! Method. +! ------- +! See documentation + +! Externals. SUMPLATB and SUEMPLATB. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! MPP Group *ECMWF* + +! Modifications. +! -------------- +! Original : 95-10-01 +! David Dent:97-06-02 parameters KFRSTLAT etc added +! JF. Estrade:97-11-13 Adaptation to ALADIN case +! J.Boutahar: 98-07-06 phasing with CY19 +! Modified 98-08-10 by K. YESSAD: removal of LRPOLE option + cleanings +! (correct computation of extrapolar latitudes for KPROCL). +! Modified 98-12-07 by K. YESSAD and C. FISCHER: cleaning. +! - merge old sumplat.F and suemplat.F +! - gather 'lelam' code and 'not lelam' code. +! - clean (useless duplication of variables, non doctor features). +! - remodularise according to lelam/not lelam +! -> lelam features in new routine suemplatb.F, +! not lelam features in new routine sumplatb.F +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM + +USE TPM_GEOMETRY ,ONLY : G + +USE SUMPLATB_MOD ,ONLY : SUMPLATB +! + +IMPLICIT NONE + +! * DUMMY: +INTEGER(KIND=JPIM),INTENT(IN) :: KDGL +INTEGER(KIND=JPIM),INTENT(IN) :: KPROCA +INTEGER(KIND=JPIM),INTENT(IN) :: KMYSETA +INTEGER(KIND=JPIM),INTENT(OUT) :: KULTPP(:) +INTEGER(KIND=JPIM),INTENT(OUT) :: KPROCL(:) +INTEGER(KIND=JPIM),INTENT(OUT) :: KPTRLS(:) + +! * LOCAL: +INTEGER(KIND=JPIM) :: INDIC(KPROCA),ILAST(KPROCA) + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: IA, ILAT, ISTART, IMEDIAP,IRESTM, JA, JLTLOC + +LOGICAL :: LLSPLIT,LLFOURIER + +! ----------------------------------------------------------------- + +!* 1. CODE DEPENDING ON 'LELAM': COMPUTATION OF +! KMEDIAP, KRESTM, INDIC, ILAST. +! ----------------------------------------- + +LLSPLIT = .FALSE. +LLFOURIER = .TRUE. + +CALL SUMPLATB(1,KDGL,KPROCA,G%NLOEN,LLSPLIT,LLFOURIER,& + &IMEDIAP,IRESTM,INDIC,ILAST) + +! ----------------------------------------------------------------- + +!* 2. CODE NOT DEPENDING ON 'LELAM': +! ------------------------------ + + + +! * Definitions related to distribution of latitudes along sets +! ------------ in fourier-space ----------------------------- +ISTART = 0 +KULTPP(1) = ILAST(1) +DO JA=1,KPROCA + IF(JA > 1) THEN + IF(ILAST(JA) /= 0) THEN + KULTPP(JA) = ILAST(JA)-ILAST(JA-1) + ELSE + KULTPP(JA) = 0 + ENDIF + ENDIF + DO JLTLOC=1,KULTPP(JA) + ILAT = ISTART + JLTLOC + KPROCL(ILAT) = JA + ENDDO + ISTART = ISTART + KULTPP(JA) +ENDDO + +! * Computes KPTRLS. + +IA = KPROCL(1) +KPTRLS(IA) = 1 +DO JA=IA+1,KPROCA + KPTRLS(JA) = KPTRLS(JA-1) + KULTPP(JA-1) +ENDDO + +END SUBROUTINE SUMPLATF +END MODULE SUMPLATF_MOD diff --git a/src/trans/gpu/internal/supol_mod.F90 b/src/trans/gpu/internal/supol_mod.F90 new file mode 100755 index 00000000..327ec60e --- /dev/null +++ b/src/trans/gpu/internal/supol_mod.F90 @@ -0,0 +1,172 @@ +! (C) Copyright 1987- ECMWF. +! (C) Copyright 1987- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUPOL_MOD +CONTAINS +SUBROUTINE SUPOL(KNSMAX,PDDMU,PFN,PDDPOL) + +!**** *SUPOL * - Routine to compute the Legendre polynomials + +! Purpose. +! -------- +! For a given value of mu, computes the Legendre polynomials. + +!** Interface. +! ---------- +! *CALL* *SUPOL(...) + +! Explicit arguments : +! -------------------- +! KNSMAX : Truncation (triangular) [in] +! PDDMU : Abscissa at which the polynomials are computed (mu) [in] +! PFN : Fourier coefficients of series expansion +! for the ordinary Legendre polynomials [in] +! PDDPOL : Polynomials (the first index is m and the second n) [out] + +! Implicit arguments : None +! -------------------- + +! Method. +! ------- +! See documentation about spectral transforms +! (doc (IDTS) by K. Yessad, appendix 3, or doc (NTA30) by M. Rochas) + +! Externals. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 87-10-15 +! K. YESSAD (MAY 1998): modification to avoid underflow. +! M.Hamrud 01-Oct-2003 CY28 Cleaning +! R. El Khatib 11-Apr-2007 Emulation of vectorized quadruple precision +! on NEC +! K. YESSAD (NOV 2008): make consistent arp/SUPOLA and tfl/SUPOL. +! Nils Wedi + Mats Hamrud, 2009-02-05 revised following Swarztrauber, 2002 +! R. El Khatib 30-Apr-2013 Open-MP parallelization +! F. Vana 05-Mar-2015 Support for single precision +! ------------------------------------------------------------------ + +USE EC_PARKIND ,ONLY : JPRD, JPIM +USE TPM_POL ,ONLY : DDI, DDA, DDH, DDE, DDC, DDD + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KNSMAX +REAL(KIND=JPRD) ,INTENT(IN) :: PDDMU +REAL(KIND=JPRD) ,INTENT(IN) :: PFN(0:KNSMAX,0:KNSMAX) + +REAL(KIND=JPRD) ,INTENT(OUT) :: PDDPOL(0:KNSMAX,0:KNSMAX) + +REAL(KIND=JPRD) :: ZDLX,ZDLX1,ZDLSITA,ZDL1SITA,ZDLS,ZDLK,ZDLLDN + +INTEGER(KIND=JPIM) :: JM, JN, JK +REAL(KIND=JPRD) :: Z + +! ------------------------------------------------------------------ + +!* 1. First two columns. +! ------------------ + +ZDLX=PDDMU +ZDLX1=ACOS(ZDLX) +ZDLSITA=SQRT(1.0_JPRD-ZDLX*ZDLX) + +PDDPOL(0,0)=1._JPRD +ZDLLDN = 0.0_JPRD + +! IF WE ARE LESS THAN 1Meter FROM THE POLE, +IF(ABS(REAL(ZDLSITA,KIND(Z))) <= SQRT(EPSILON(Z)))THEN + ZDLX=1._JPRD + ZDLSITA=0._JPRD + ZDL1SITA=0._JPRD +ELSE + ZDL1SITA=1.0_JPRD/ZDLSITA +ENDIF + +!* ordinary Legendre polynomials from series expansion +! --------------------------------------------------- + +! even N +!$OMP PARALLEL DO PRIVATE(JN,ZDLK,ZDLLDN,JK) +DO JN=2,KNSMAX,2 + ZDLK = 0.5_JPRD*PFN(JN,0) + ZDLLDN = 0.0_JPRD + ! represented by only even k + DO JK=2,JN,2 + ! normalised ordinary Legendre polynomial == \overbar{P_n}^0 + ZDLK = ZDLK + PFN(JN,JK)*COS(DDI(JK)*ZDLX1) + ! normalised associated Legendre polynomial == \overbar{P_n}^1 + ZDLLDN = ZDLLDN + DDA(JN)*PFN(JN,JK)*DDI(JK)*SIN(DDI(JK)*ZDLX1) + ENDDO + PDDPOL(0,JN) = ZDLK + PDDPOL(1,JN) = ZDLLDN +ENDDO +!$OMP END PARALLEL DO +! odd N +!$OMP PARALLEL DO PRIVATE(JN,ZDLK,ZDLLDN,JK) +DO JN=1,KNSMAX,2 + ZDLK = 0.0_JPRD + ZDLLDN = 0.0_JPRD + ! represented by only odd k + DO JK=1,JN,2 + ! normalised ordinary Legendre polynomial == \overbar{P_n}^0 + ZDLK = ZDLK + PFN(JN,JK)*COS(DDI(JK)*ZDLX1) + ! normalised associated Legendre polynomial == \overbar{P_n}^1 + ZDLLDN = ZDLLDN + DDA(JN)*PFN(JN,JK)*DDI(JK)*SIN(DDI(JK)*ZDLX1) + ENDDO + PDDPOL(0,JN) = ZDLK + PDDPOL(1,JN) = ZDLLDN +ENDDO +!$OMP END PARALLEL DO + +! ------------------------------------------------------------------ + +!* 2. Diagonal (the terms 0,0 and 1,1 have already been computed) +! Belousov, equation (23) +! ----------------------------------------------------------- + +ZDLS=ZDL1SITA*TINY(ZDLS) + +#ifdef VPP +!OCL SCALAR +#endif +DO JN=2,KNSMAX + PDDPOL(JN,JN)=PDDPOL(JN-1,JN-1)*ZDLSITA*DDH(JN) + IF ( ABS(PDDPOL(JN,JN)) < ZDLS ) PDDPOL(JN,JN)=0.0_JPRD +ENDDO + +! ------------------------------------------------------------------ + +!* 3. General recurrence (Belousov, equation 17) +! ----------------------------------------- + +DO JN=3,KNSMAX +!DIR$ IVDEP +!OCL NOVREC + DO JM=2,JN-1 + PDDPOL(JM,JN)=DDC(JM,JN)*PDDPOL(JM-2,JN-2)& + &-DDD(JM,JN)*PDDPOL(JM-2,JN-1)*ZDLX & + &+DDE(JM,JN)*PDDPOL(JM ,JN-1)*ZDLX + ENDDO +ENDDO + +! ------------------------------------------------------------------ + +END SUBROUTINE SUPOL +END MODULE SUPOL_MOD diff --git a/src/trans/gpu/internal/supolf_mod.F90 b/src/trans/gpu/internal/supolf_mod.F90 new file mode 100755 index 00000000..eb3e25b4 --- /dev/null +++ b/src/trans/gpu/internal/supolf_mod.F90 @@ -0,0 +1,284 @@ +! (C) Copyright 1987- ECMWF. +! (C) Copyright 1987- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUPOLF_MOD +CONTAINS +SUBROUTINE SUPOLF(KM,KNSMAX,DDMU,DDPOL,KCHEAP) + +!**** *SUPOL * - Routine to compute the Legendre polynomials + +! Purpose. +! -------- +! For a given value of mu and M, computes the Legendre +! polynomials upto KNSMAX + +!** Interface. +! ---------- +! *CALL* *SUPOLF(KM,KNSMAX,DDMU,DDPOL,KCHEAP) + +! Explicit arguments : +! -------------------- +! KM : zonal wavenumber M +! KNSMAX : Truncation (triangular) +! DDMU : Abscissa at which the polynomials are computed (mu) +! DDPOL : Polynomials (the first index is m and the second n) +! KCHEAP : odd/even saving switch + + +! Implicit arguments : None +! -------------------- + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! Nils Wedi + George Mozdzynski + Mats Hamrud + +! Modifications. +! -------------- +! Original : 87-10-15 +! K. YESSAD (MAY 1998): modification to avoid underflow. +! R. El Khatib 11-Apr-2007 Emulation of vectorized quadruple precision +! on NEC +! F. Vana 05-Mar-2015 Support for single precision +! ------------------------------------------------------------------ + +USE EC_PARKIND ,ONLY : JPRD, JPIM + +USE TPM_POL ,ONLY : DFI, DFB, DFG, DFA, DFF + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KM +INTEGER(KIND=JPIM),INTENT(IN) :: KNSMAX +REAL(KIND=JPRD) ,INTENT(IN) :: DDMU +REAL(KIND=JPRD) ,INTENT(OUT) :: DDPOL(0:KNSMAX) + +INTEGER(KIND=JPIM),INTENT(IN), OPTIONAL :: KCHEAP + +REAL(KIND=JPRD) :: DLX,DLX1,DLSITA,DLSITA2,DL1SITA,DLK,DL1, DLKM1, DLKM2 + +INTEGER(KIND=JPIM), PARAMETER :: JPKD=KIND(DLX) + +INTEGER(KIND=JPIM) :: JN, KKL, ICHEAP, IC, IEND +REAL(KIND=JPRD) :: DCL, DDL + +REAL(KIND=JPRD) :: ZFAC, ZLSITA, ZFAC0, ZFAC1, ZMULT, ZEPS + +INTEGER(KIND=JPIM) :: JCORR, ICORR3, ICORR(KNSMAX) +REAL(KIND=JPRD) :: ZSCALE, ZISCALE + +DCL(KKL)=SQRT((REAL(KKL-KM+1,JPKD)*REAL(KKL-KM+2,JPKD)* & + & REAL(KKL+KM+1,JPKD)*REAL(KKL+KM+2,JPKD))/(REAL(2*KKL+1,JPKD)*REAL(2*KKL+3,JPKD)*& + & REAL(2*KKL+3,JPKD)*REAL(2*KKL+5,JPKD))) +DDL(KKL)=(2.0_JPKD*REAL(KKL,JPKD)*REAL(KKL+1,JPKD)-2.0_JPKD*REAL(KM**2,JPKD)-1.0_JPKD)/ & + & (REAL(2*KKL-1,JPKD)*REAL(2*KKL+3,JPKD)) + +! ------------------------------------------------------------------ + +!* 1. First two columns. +! ------------------ + +ZEPS = EPSILON(ZSCALE) +ICORR3=0 + +ICHEAP=1 +IF( PRESENT(KCHEAP) ) THEN + ICHEAP = KCHEAP +ENDIF + +DLX=DDMU +DLX1=ACOS(DLX) +DLSITA2=1.0_JPRD-DLX*DLX +DLSITA=SQRT(DLSITA2) + +!* ordinary Legendre polynomials from series expansion +! --------------------------------------------------- + +! this is supol_fast just using single KM +IF( ABS(REAL(DLSITA,JPRD)) <= ZEPS ) THEN + DLX=1._JPRD + DLSITA=0._JPRD + DL1SITA=0._JPRD + DLSITA2=0._JPRD +ELSE + DL1SITA=1.0_JPRD/DLSITA +ENDIF + +DLKM2=1._JPRD +DLKM1=DLX + +IF( KM == 0 ) THEN + DDPOL(0)=DLKM2 + DDPOL(1)=DLKM1*DFB(1)/DFA(1) + DO JN=2,KNSMAX + DLK=DFF(JN)*DLX*DLKM1-DFG(JN)*DLKM2 + DL1=DFI(JN)*(DLKM1-DLX*DLK)*DL1SITA + DDPOL(JN)=DLK*DFB(JN)/DFA(JN) + DLKM2=DLKM1 + DLKM1=DLK + ENDDO +ELSEIF( KM == 1 ) THEN + DDPOL(0)=0 + DDPOL(1)=DLSITA*DFB(1) + DO JN=2,KNSMAX + DLK=DFF(JN)*DLX*DLKM1-DFG(JN)*DLKM2 + DL1=DFI(JN)*(DLKM1-DLX*DLK)*DL1SITA + DDPOL(JN)=DL1*DFB(JN) + DLKM2=DLKM1 + DLKM1=DLK + ENDDO +ELSE + +! ------------------------------------------------------------------ +!* KM >= 2 +! ------------------------------------------------------------------ + +! ZSCALE=1._JPRD/ZEPS + ! Maintaining the consistency with the CY41R1 reference + ZSCALE=1.0E+100_JPRD + ZISCALE=1.0E-100_JPRD + ! General case + !ZSCALE = 10._JPRD**( MAXEXPONENT(ZSCALE)/10) + !ZISCALE = 10._JPRD**(-MAXEXPONENT(ZSCALE)/10) + + IEND=KM/2 + ZLSITA=1._JPRD +! WRITE(*,*) 'SUPOLF: DLSITA2=',DLSITA2,' DDMU=',DDMU,' DLX=',DLX + DO JN=1,IEND + ZLSITA=ZLSITA*DLSITA2 + IF( ABS(ZLSITA) < ZISCALE ) THEN + ZLSITA=ZLSITA*ZSCALE + ICORR3=ICORR3+1 + ENDIF + ENDDO + IF( MOD(KM,2) == 1 ) ZLSITA=ZLSITA*DLSITA +! WRITE(*,*) 'SUPOLF: ZSCALE=',ZSCALE,' ICORR3=',ICORR3,' KM=',KM,' ZLSITA=',ZLSITA + + ZFAC0=1._JPRD + ZFAC=1._JPRD + DO JN=1,KM-1 + ZFAC=ZFAC*SQRT(REAL(2*JN-1,JPRD)) + ZFAC=ZFAC/SQRT(REAL(2*JN,JPRD)) + ENDDO + ZFAC=ZFAC*SQRT(REAL(2*KM-1,JPRD)) +! WRITE(*,*) 'SUPOLF: ZSCALE=',ZSCALE,' ICORR3=',ICORR3,' ZFAC=',ZFAC + + ZFAC1=1._JPRD + DO IC=0,MIN(KNSMAX-KM,3) + + ! (2m+i)! + ZFAC0 = ZFAC0 * REAL(2*KM+IC,JPRD) + + SELECT CASE (IC) + CASE (0) + ZMULT=ZFAC + CASE (1) + ZFAC=ZFAC*REAL(2*KM+IC,JPRD) + ZMULT=ZFAC*DLX + CASE (2) + ZMULT=0.5_JPRD*ZFAC*(REAL(2*KM+3,JPRD)*DLX*DLX-1._JPRD) + CASE (3) + ZFAC=ZFAC*REAL(2*KM+IC,JPRD) + ZMULT=(1._JPRD/6._JPRD)*DLX*ZFAC*(REAL(2*KM+5,JPRD)*DLX*DLX-3._JPRD) + END SELECT + + DDPOL(KM+IC) = ZLSITA*ZMULT*SQRT(2._JPRD*(REAL(KM+IC,JPRD)+0.5_JPRD)*ZFAC1/ZFAC0) + + ZFAC1=ZFAC1*REAL(IC+1,JPRD) + + ENDDO + + ICORR(:)=ICORR3 + IF( ICHEAP == 2 ) THEN + ! symmetric case + DO JN=KM+2,KNSMAX-2,2 + + IF( ABS(DDPOL(JN-2)) > ZSCALE ) THEN + DDPOL(JN-2)=DDPOL(JN-2)/ZSCALE + DDPOL(JN)=DDPOL(JN)/ZSCALE + ICORR(JN-2:KNSMAX)=ICORR(JN-2:KNSMAX)-1 + ENDIF + + DDPOL(JN+2)=((DLX*DLX-DDL(JN))*DDPOL(JN)-DCL(JN-2)*DDPOL(JN-2))/DCL(JN) + ENDDO + + DO JN=KM,KNSMAX,2 + DO JCORR=1,ICORR(JN) + DDPOL(JN)=DDPOL(JN)/ZSCALE + IF( DDPOL(JN) < ZEPS ) THEN + DDPOL(JN) = ZEPS + ENDIF + ENDDO + ENDDO + + ELSEIF( ICHEAP == 3 ) THEN + ! antisymmetric case + DO JN=KM+3,KNSMAX-2,2 + + IF( ABS(DDPOL(JN-2)) > ZSCALE ) THEN + DDPOL(JN-2)=DDPOL(JN-2)/ZSCALE + DDPOL(JN)=DDPOL(JN)/ZSCALE + ICORR(JN-2:KNSMAX)=ICORR(JN-2:KNSMAX)-1 + ENDIF + + DDPOL(JN+2)=((DLX*DLX-DDL(JN))*DDPOL(JN)-DCL(JN-2)*DDPOL(JN-2))/DCL(JN) + ENDDO + + DO JN=KM+1,KNSMAX,2 + DO JCORR=1,ICORR(JN) + DDPOL(JN)=DDPOL(JN)/ZSCALE + IF( DDPOL(JN) < ZEPS ) THEN + DDPOL(JN) = ZEPS + ENDIF + ENDDO + ENDDO + + ELSE + DO JN=KM+2,KNSMAX-2 + + IF( ABS(DDPOL(JN-2)) > ZSCALE ) THEN + DDPOL(JN-2)=DDPOL(JN-2)/ZSCALE + DDPOL(JN-1)=DDPOL(JN-1)/ZSCALE + DDPOL(JN)=DDPOL(JN)/ZSCALE + DDPOL(JN+1)=DDPOL(JN+1)/ZSCALE + ICORR(JN-2:KNSMAX)=ICORR(JN-2:KNSMAX)-1 + ENDIF + + DDPOL(JN+2)=((DLX*DLX-DDL(JN))*DDPOL(JN)-DCL(JN-2)*DDPOL(JN-2))/DCL(JN) + + ENDDO + + DO JN=KM,KNSMAX + DO JCORR=1,ICORR(JN) + DDPOL(JN)=DDPOL(JN)/ZSCALE + IF( DDPOL(JN) < ZEPS ) THEN + DDPOL(JN) = ZEPS + ENDIF + ENDDO + ENDDO + + ENDIF + +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE SUPOLF +END MODULE SUPOLF_MOD diff --git a/src/trans/gpu/internal/sustaonl_mod.F90 b/src/trans/gpu/internal/sustaonl_mod.F90 new file mode 100755 index 00000000..9b01daee --- /dev/null +++ b/src/trans/gpu/internal/sustaonl_mod.F90 @@ -0,0 +1,457 @@ +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUSTAONL_MOD +CONTAINS +SUBROUTINE SUSTAONL(KMEDIAP,KRESTM,LDWEIGHTED_DISTR,PWEIGHT,PMEDIAP,KPROCAGP) + +!**** *SUSTAONL * - Routine to initialize parallel environment + +! Purpose. +! -------- +! Initialize D%NSTA and D%NONL. +! Calculation of distribution of grid points to processors : +! Splitting of grid in B direction + +!** Interface. +! ---------- +! *CALL* *SUSTAONL * + +! Explicit arguments : +! -------------------- +! KMEDIAP - mean number of grid points per PE +! KRESTM - number of PEs with one extra point +! LDWEIGHTED_DISTR -true if weighted distribution +! PWEIGHT -weight per grid-point if weighted distribution +! PMEDIAP -mean weight per PE if weighted distribution +! KPROCAGP -number of grid points per A set + +! Implicit arguments : +! -------------------- + + +! Method. +! ------- +! See documentation + +! Externals. NONE. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! MPP Group *ECMWF* + +! Modifications. +! -------------- +! Original : 95-10-01 +! Modified 98-08-10 by K. YESSAD: removal of LRPOLE option. +! - removal of LRPOLE in YOMCT0. +! - removal of code under LRPOLE. +! Modified 98-12-04 C. Fischer: merge with SUESTAONL (Aladin) +! R. El Khatib 05-Apr-2007 Enable back vectorization on NEC +! R. El Khatib 30-Apr-2013 Optimization +! R. El Khatib 26-Apr-2018 vectorization +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRB, JPRBT, JPRD +USE MPL_MODULE ,ONLY : MPL_ALLGATHERV, MPL_RECV, MPL_SEND + +USE TPM_GEN ,ONLY : NOUT, NPRINTLEV +USE TPM_DIM ,ONLY : R +USE TPM_GEOMETRY ,ONLY : G +USE TPM_DISTR ,ONLY : D, LEQ_REGIONS, MTAGPART, NPRCIDS, MYPROC, NPROC + +USE SET2PE_MOD ,ONLY : SET2PE +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE EQ_REGIONS_MOD ,ONLY : MY_REGION_NS, MY_REGION_EW, & + & N_REGIONS, N_REGIONS_EW, N_REGIONS_NS +! + +IMPLICIT NONE + +! DUMMY +INTEGER(KIND=JPIM),INTENT(IN) :: KMEDIAP +INTEGER(KIND=JPIM),INTENT(IN) :: KRESTM +REAL(KIND=JPRBT),INTENT(IN) :: PWEIGHT(:) +LOGICAL,INTENT(IN) :: LDWEIGHTED_DISTR +REAL(KIND=JPRBT),INTENT(IN) :: PMEDIAP +INTEGER(KIND=JPIM),INTENT(IN) :: KPROCAGP(:) + +! LOCAL + +INTEGER(KIND=JPIM) :: IXPTLAT(R%NDGL), ILSTPTLAT(R%NDGL),ISENDREQ(NPROC) +INTEGER(KIND=JPIM) :: ICHK(R%NDLON,R%NDGL), ICOMBUF(R%NDGL*N_REGIONS_EW*2) +INTEGER(KIND=JPIM) :: I1, I2, IBUFLEN, IDGLG, IDWIDE,& + &IGL, IGL1, IGL2, IGLOFF, IGPTA, & + &IGPTPRSETS, IGPTS, IGPTSP, ILEN, ILRECV, & + &ILSEND, INPLAT, INXLAT, IPOS, & + &IPROCB, IPTSRE, IRECV, IPE, & + &IREST, ISEND, ITAG, JA, JB, JGL, JL, JNPTSRE, & + &ILAT, ILON, ILOEN +INTEGER(KIND=JPIM),ALLOCATABLE :: ICOMBUFG(:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZWEIGHT(:,:) +INTEGER(KIND=JPIM) :: JJ, ILENG(NPROC), IOFF(NPROC) +REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 + +LOGICAL :: LLABORT +LOGICAL :: LLP1,LLP2 + +REAL(KIND=JPRBT) :: ZCOMP,ZPI,ZLON +REAL(KIND=JPRBT) :: ZDIVID(R%NDGL) +INTEGER(KIND=JPIM) :: ILATMD,ILATMD1 + +! ----------------------------------------------------------------- + +ZPI = 2.0_JPRBT*ASIN(1.0_JPRBT) + +IXPTLAT (:)=999999 +ILSTPTLAT(:)=999999 + +LLP1 = NPRINTLEV>0 +LLP2 = NPRINTLEV>1 + +IDWIDE = R%NDGL/2 +IBUFLEN = R%NDGL*N_REGIONS_EW*2 +IDGLG = R%NDGL + +I1 = MAX( 1,D%NFRSTLAT(MY_REGION_NS)-D%NFRSTLOFF) +I2 = MIN(IDGLG,D%NLSTLAT (MY_REGION_NS)-D%NFRSTLOFF) + +ILEN = D%NLSTLAT(MY_REGION_NS) - D%NFRSTLAT(MY_REGION_NS)+1 + +IGPTPRSETS = SUM(G%NLOEN(1:D%NFRSTLAT(MY_REGION_NS)-1)) + +IF (D%LSPLIT) THEN + IF( LEQ_REGIONS )THEN + IGPTA=0 + DO JA=1,MY_REGION_NS-1 + IGPTA = IGPTA + KPROCAGP(JA) + ENDDO + IGPTS = KPROCAGP(MY_REGION_NS) + ELSE + IF (MY_REGION_NS <= KRESTM.OR.KRESTM == 0) THEN + IGPTS = KMEDIAP + IGPTA = KMEDIAP*(MY_REGION_NS-1) + ELSE + IGPTS = KMEDIAP-1 + IGPTA = KMEDIAP*KRESTM+IGPTS*(MY_REGION_NS-1-KRESTM) + ENDIF + ENDIF +ELSE + IGPTA = IGPTPRSETS + IGPTS = SUM(G%NLOEN(D%NFRSTLAT(MY_REGION_NS):D%NLSTLAT(MY_REGION_NS))) +ENDIF + +IGPTSP = IGPTS/N_REGIONS(MY_REGION_NS) +IREST = IGPTS-N_REGIONS(MY_REGION_NS)*IGPTSP +IXPTLAT(1) = IGPTA-IGPTPRSETS+1 +ILSTPTLAT(1) = G%NLOEN(D%NFRSTLAT(MY_REGION_NS)) +INPLAT = G%NLOEN(D%NFRSTLAT(MY_REGION_NS))-IXPTLAT(1)+1 +DO JGL=2,ILEN + IXPTLAT(JGL) = 1 + ILSTPTLAT(JGL) = G%NLOEN(D%NFRSTLAT(MY_REGION_NS)+JGL-1) + INPLAT = INPLAT+G%NLOEN(D%NFRSTLAT(MY_REGION_NS)+JGL-1) +ENDDO +ILSTPTLAT(ILEN) = G%NLOEN(D%NLSTLAT(MY_REGION_NS))-INPLAT+IGPTS + +DO JB=1,N_REGIONS_EW + DO JGL=1,R%NDGL+N_REGIONS_NS-1 + D%NSTA(JGL,JB) = 0 + D%NONL(JGL,JB) = 0 + ENDDO +ENDDO + + +! grid point decomposition +! --------------------------------------- +IF( NPROC > 1 )THEN + DO JGL=1,ILEN + ZDIVID(JGL) = 360000.0_JPRBT/REAL(G%NLOEN(D%NFRSTLAT(MY_REGION_NS)+JGL-1),JPRBT) + ENDDO + IF( LDWEIGHTED_DISTR )THEN + ALLOCATE(ZWEIGHT(G%NLOEN(R%NDGL/2),R%NDGL)) + IGL=0 + DO JGL=1,R%NDGL + DO JL=1,G%NLOEN(JGL) + IGL=IGL+1 + ZWEIGHT(JL,JGL)=PWEIGHT(IGL) + ENDDO + ENDDO + ZCOMP=0 + IGPTS=0 + ENDIF + + DO JB=1,N_REGIONS(MY_REGION_NS) + + IF( .NOT.LDWEIGHTED_DISTR )THEN + + IF (JB <= IREST) THEN + IPTSRE = IGPTSP+1 + ELSE + IPTSRE = IGPTSP + ENDIF + DO JNPTSRE=1,IPTSRE + + ILATMD = 360000 !! 360*1000 + DO JGL=1,ILEN + IF (IXPTLAT(JGL) <= ILSTPTLAT(JGL)) THEN + ILATMD1 = NINT(REAL(IXPTLAT(JGL)-1,JPRB)*ZDIVID(JGL)) + IF(ILATMD1 < ILATMD) THEN + ILATMD = ILATMD1 + INXLAT = JGL + ENDIF + ENDIF + ENDDO + + IF (INXLAT >= I1 .AND. INXLAT <= I2) THEN + IGL=D%NPTRFLOFF+INXLAT + IF (D%NSTA(IGL,JB) == 0) THEN + D%NSTA(IGL,JB) = IXPTLAT(INXLAT) + ENDIF + D%NONL(IGL,JB) = D%NONL(IGL,JB)+1 + ENDIF + IXPTLAT(INXLAT) = IXPTLAT(INXLAT)+1 + ENDDO + + ELSE + + DO WHILE ( (JB < N_REGIONS(MY_REGION_NS) .AND. ZCOMP < PMEDIAP) & + & .OR. (JB == N_REGIONS(MY_REGION_NS) .AND. IGPTS < KPROCAGP(MY_REGION_NS)) ) + + IGPTS = IGPTS + 1 + ILATMD = 360000 !! 360*1000 + DO JGL=1,ILEN + IF (IXPTLAT(JGL) <= ILSTPTLAT(JGL)) THEN + ILATMD1 = NINT(REAL(IXPTLAT(JGL)-1,JPRB)*ZDIVID(JGL)) + IF(ILATMD1 < ILATMD) THEN + ILATMD = ILATMD1 + INXLAT = JGL + ENDIF + ENDIF + ENDDO + + IF (INXLAT >= I1 .AND. INXLAT <= I2) THEN + IGL=D%NPTRFLOFF+INXLAT + IF (D%NSTA(IGL,JB) == 0) THEN + D%NSTA(IGL,JB) = IXPTLAT(INXLAT) + ENDIF + D%NONL(IGL,JB) = D%NONL(IGL,JB)+1 + IF(IGL<1.OR.IGL>R%NDGL+N_REGIONS_NS-1)THEN + CALL ABORT_TRANS(' SUSTAONL: IGL<1.OR.IGL>R%NDGL+N_REGIONS_NS-1') + ENDIF + ILON=D%NSTA(IGL,JB)+D%NONL(IGL,JB)-1 + ILAT=D%NFRSTLAT(MY_REGION_NS)+INXLAT-1 + ILOEN=G%NLOEN(ILAT) + IF(ILON<1.OR.ILON>ILOEN)THEN + CALL ABORT_TRANS(' SUSTAONL: ILON<1.OR.ILON>ILOEN') + ENDIF + ZCOMP = ZCOMP + ZWEIGHT(ILON,ILAT) + ENDIF + IXPTLAT(INXLAT) = IXPTLAT(INXLAT)+1 + ENDDO + + ZCOMP = ZCOMP - PMEDIAP + + ENDIF + + ENDDO + + IF( LDWEIGHTED_DISTR )THEN + DEALLOCATE(ZWEIGHT) + ENDIF + + ! Exchange local partitioning info to produce global view + ! + + CALL GSTATS_BARRIER(795) + CALL GSTATS(814,0) + IF( LEQ_REGIONS )THEN + + ITAG = MTAGPART + IPOS = 0 + DO JGL=1,D%NLSTLAT(MY_REGION_NS)-D%NFRSTLAT(MY_REGION_NS)+1 + IPOS = IPOS+1 + ICOMBUF(IPOS) = D%NSTA(D%NPTRFLOFF+JGL,MY_REGION_EW) + IPOS = IPOS+1 + ICOMBUF(IPOS) = D%NONL(D%NPTRFLOFF+JGL,MY_REGION_EW) + ENDDO + IF( IPOS > IBUFLEN )THEN + CALL ABORT_TRANS(' SUSTAONL: SEND BUFFER TOO SMALL FOR GLOBAL INFO') + ENDIF + ILSEND = IPOS + + DO JA=1,N_REGIONS_NS + DO JB=1,N_REGIONS(JA) + CALL SET2PE(IRECV,JA,JB,0,0) + ILEN = (D%NLSTLAT(JA)-D%NFRSTLAT(JA)+1)*2 + ILENG(NPRCIDS(IRECV))=ILEN + ENDDO + ENDDO + IOFF(1)=0 + DO JJ=2,NPROC + IOFF(JJ)=IOFF(JJ-1)+ILENG(JJ-1) + ENDDO + ALLOCATE(ICOMBUFG(SUM(ILENG(:)))) + CALL MPL_ALLGATHERV(ICOMBUF(1:ILSEND),ICOMBUFG,ILENG,CDSTRING='SUSTAONL') + DO JA=1,N_REGIONS_NS + IGL1 = D%NFRSTLAT(JA) + IGL2 = D%NLSTLAT(JA) + DO JB=1,N_REGIONS(JA) + CALL SET2PE(IRECV,JA,JB,0,0) + IF(IRECV /= MYPROC) THEN + ILEN = (D%NLSTLAT(JA)-D%NFRSTLAT(JA)+1)*2 + IPOS = IOFF(NPRCIDS(IRECV)) + DO JGL=IGL1,IGL2 + IGL = D%NPTRFRSTLAT(JA)+JGL-IGL1 + IPOS = IPOS+1 + D%NSTA(IGL,JB) = ICOMBUFG(IPOS) + IPOS = IPOS+1 + D%NONL(IGL,JB) = ICOMBUFG(IPOS) + ENDDO + ENDIF + ENDDO + ENDDO + DEALLOCATE(ICOMBUFG) + + ELSE + + ITAG = MTAGPART + IPOS = 0 + DO JB=1,N_REGIONS(MY_REGION_NS) + DO JGL=1,D%NLSTLAT(MY_REGION_NS)-D%NFRSTLAT(MY_REGION_NS)+1 + IPOS = IPOS+1 + ICOMBUF(IPOS) = D%NSTA(D%NPTRFLOFF+JGL,JB) + IPOS = IPOS+1 + ICOMBUF(IPOS) = D%NONL(D%NPTRFLOFF+JGL,JB) + ENDDO + ENDDO + IF( IPOS > IBUFLEN )THEN + CALL ABORT_TRANS(' SUSTAONL: SEND BUFFER TOO SMALL FOR GLOBAL INFO') + ENDIF + ILSEND = IPOS + DO JA=1,N_REGIONS_NS + CALL SET2PE(ISEND,JA,MY_REGION_EW,0,0) + IF(ISEND /= MYPROC) THEN + CALL MPL_SEND(ICOMBUF(1:ILSEND),KDEST=NPRCIDS(ISEND),KTAG=ITAG, & + & CDSTRING='SUSTAONL:') + ENDIF + ENDDO + + DO JA=1,N_REGIONS_NS + CALL SET2PE(IRECV,JA,MY_REGION_EW,0,0) + IF(IRECV /= MYPROC) THEN + ILEN = (D%NLSTLAT(JA)-D%NFRSTLAT(JA)+1)*N_REGIONS(JA)*2 + CALL MPL_RECV(ICOMBUF(1:ILEN),KSOURCE=NPRCIDS(IRECV),KTAG=ITAG, & + & KOUNT=ILRECV,CDSTRING='SUSTAONL:') + IGL1 = D%NFRSTLAT(JA) + IGL2 = D%NLSTLAT(JA) + IPOS = 0 + DO JB=1,N_REGIONS(JA) + DO JGL=IGL1,IGL2 + IGL = D%NPTRFRSTLAT(JA)+JGL-IGL1 + IPOS = IPOS+1 + D%NSTA(IGL,JB) = ICOMBUF(IPOS) + IPOS = IPOS+1 + D%NONL(IGL,JB) = ICOMBUF(IPOS) + ENDDO + ENDDO + ENDIF + ENDDO + + ENDIF + CALL GSTATS(814,1) + CALL GSTATS_BARRIER2(795) +ELSE + DO JGL=1,R%NDGL + D%NSTA(JGL,1) = 1 + D%NONL(JGL,1) = G%NLOEN(JGL) + ENDDO +ENDIF + +! Confirm consistency of global partitioning, specifically testing for +! multiple assignments of same grid point and unassigned grid points + +LLABORT = .FALSE. +DO JGL=1,R%NDGL + DO JL=1,G%NLOEN(JGL) + ICHK(JL,JGL) = 1 + ENDDO +ENDDO +DO JA=1,N_REGIONS_NS + IGLOFF = D%NPTRFRSTLAT(JA) + DO JB=1,N_REGIONS(JA) + IGL1 = D%NFRSTLAT(JA) + IGL2 = D%NLSTLAT(JA) + DO JGL=IGL1,IGL2 + IGL = IGLOFF+JGL-IGL1 + DO JL=D%NSTA(IGL,JB),D%NSTA(IGL,JB)+D%NONL(IGL,JB)-1 + IF( ICHK(JL,JGL) /= 1 )THEN + WRITE(NOUT,'(" SUSTAONL : seta=",i4," setb=",i4,& + &" row=",I4," sta=",I4," INVALID GRID POINT")')& + &JA,JB,JGL,JL + WRITE(0,'(" SUSTAONL : seta=",i4," setb=",i4,& + &" ROW=",I4," sta=",I4," INVALID GRID POINT")')& + &JA,JB,JGL,JL + LLABORT = .TRUE. + ENDIF + ICHK(JL,JGL) = 2 + ENDDO + ENDDO + ENDDO +ENDDO +DO JGL=1,R%NDGL + DO JL=1,G%NLOEN(JGL) + IF( ICHK(JL,JGL) /= 2 )THEN + WRITE(NOUT,'(" SUSTAONL : row=",i4," sta=",i4,& + &" GRID POINT NOT ASSIGNED")') JGL,JL + LLABORT = .TRUE. + ENDIF + ENDDO +ENDDO +IF( LLABORT )THEN + WRITE(NOUT,'(" SUSTAONL : inconsistent partitioning")') + CALL ABORT_TRANS(' SUSTAONL: inconsistent partitioning') +ENDIF + + +IF (LLP1) THEN + WRITE(UNIT=NOUT,FMT='('' OUTPUT FROM ROUTINE SUSTAONL '')') + WRITE(UNIT=NOUT,FMT='('' '')') + WRITE(UNIT=NOUT,FMT='('' PARTITIONING INFORMATION '')') + WRITE(UNIT=NOUT,FMT='('' '')') + IPROCB = MIN(32,N_REGIONS_EW) + WRITE(UNIT=NOUT,FMT='(17X," SETB=",32(1X,I5))') (JB,JB=1,IPROCB) + DO JA=1,N_REGIONS_NS + IPROCB = MIN(32,N_REGIONS(JA)) + WRITE(UNIT=NOUT,FMT='('' '')') + IGLOFF = D%NPTRFRSTLAT(JA) + IGL1 = D%NFRSTLAT(JA) + IGL2 = D%NLSTLAT(JA) + DO JGL=IGL1,IGL2 + IGL=IGLOFF+JGL-IGL1 + WRITE(UNIT=NOUT,FMT='(" SETA=",I5," LAT=",I5," NSTA=",& + &32(1X,I5))') JA,JGL,(D%NSTA(IGL,JB),JB=1,IPROCB) + WRITE(UNIT=NOUT,FMT='(" SETA=",I5," LAT=",I5," D%NONL=",& + &32(1X,I5))') JA,JGL,(D%NONL(IGL,JB),JB=1,IPROCB) + ENDDO + WRITE(UNIT=NOUT,FMT='('' '')') + ENDDO + WRITE(UNIT=NOUT,FMT='('' '')') + WRITE(UNIT=NOUT,FMT='('' '')') +ENDIF + +! ------------------------------------------------------------------ + +END SUBROUTINE SUSTAONL +END MODULE SUSTAONL_MOD diff --git a/src/trans/gpu/internal/sutrle_mod.F90 b/src/trans/gpu/internal/sutrle_mod.F90 new file mode 100755 index 00000000..3ceefed1 --- /dev/null +++ b/src/trans/gpu/internal/sutrle_mod.F90 @@ -0,0 +1,364 @@ +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUTRLE_MOD +CONTAINS +SUBROUTINE SUTRLE(PNM,KGL,KLOOP) + +!**** *sutrle * - transposition of Legendre polynomials during set-up + +! Purpose. +! -------- +! transposition of Legendre polynomials during set-up + +!** Interface. +! ---------- +! *call* *sutrle(pnm) + +! Explicit arguments : +! -------------------- + +! Implicit arguments : +! -------------------- + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! MPP Group *ECMWF* + +! Modifications. +! -------------- +! Original : 95-10-01 +! P.Towers : 10-01-12 Corrected over allocation of ZSNDBUF (XT4 fix) +! G.Mozdzynski: March 2011 Support 2D (RW,RV) initialisation of legendre coeffs +! F. Vana 05-Mar-2015 Support for single precision +! ------------------------------------------------------------------ + + +USE EC_PARKIND ,ONLY : JPRD, JPIM +USE MPL_MODULE ,ONLY : MPL_ALLREDUCE, MPL_RECV, MPL_SEND, MPL_BARRIER, MPL_WAIT, & + & JP_NON_BLOCKING_STANDARD + +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D, MTAGLETR, NPRCIDS, NPRTRW, NPRTRV, & + & MYSETV, MYSETW, NPROC +USE TPM_FIELDS ,ONLY : F +USE SET2PE_MOD ,ONLY : SET2PE +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +! + +IMPLICIT NONE + +REAL(KIND=JPRD),INTENT(IN) :: PNM(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KGL +INTEGER(KIND=JPIM),INTENT(IN) :: KLOOP + +! LOCAL + +REAL(KIND=JPRD), ALLOCATABLE :: ZSNDBUFV(:),ZRCVBUFV(:,:) +REAL(KIND=JPRD), ALLOCATABLE :: ZSNDBUFW(:,:),ZRCVBUFW(:,:) +INTEGER(KIND=JPIM) :: IM, IPOS, & + & IRECVSET, IRECV, ISEND, ISENDSET, ITAG,ISENDSIZE, IRECVSIZE, & + & J, JM, JMLOC, JN, JV, JROC ,IOFFT, IOFFG, IGL, ISREQ, IRREQ +INTEGER(KIND=JPIM) :: ISENDREQ(MAX(NPRTRW,NPRTRV)) +INTEGER(KIND=JPIM) :: IRECVREQ(MAX(NPRTRW,NPRTRV)) +INTEGER(KIND=JPIM) :: IGLVS(NPRTRV) +INTEGER(KIND=JPIM) :: IGLVR(NPRTRV) +INTEGER(KIND=JPIM) :: IPOSW(NPRTRW) + +! ------------------------------------------------------------------ + +!* 0. Some initializations. +! --------------------- + +ITAG = MTAGLETR+KLOOP + +! Perform barrier synchronisation to guarantee all processors have +! completed all previous communication + +IF( NPROC > 1 .AND. KLOOP ==1)THEN + CALL GSTATS(783,0) + CALL MPL_BARRIER(CDSTRING='SUTRLE:') + CALL GSTATS(783,1) +ENDIF + +! +! First do communications in NPRTRV direction +! + +!* Calculate send buffer size + +IF(KGL > 0) THEN + ISENDSIZE = R%NSPOLEG+1 +ELSE + ISENDSIZE=1 +ENDIF + +ALLOCATE (ZSNDBUFV(ISENDSIZE)) +ALLOCATE (ZRCVBUFV(R%NSPOLEG+1,NPRTRV)) + +!* copy data to be sent into zsndbufv + +ZSNDBUFV(1) = KGL +IF(KGL > 0) THEN + CALL GSTATS(1141,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(J) + DO J=1,R%NSPOLEG + ZSNDBUFV(J+1) = PNM(J) + ENDDO +!$OMP END PARALLEL DO + CALL GSTATS(1141,1) +ENDIF + +IRREQ=0 +DO JROC=1,NPRTRV-1 + IRECV = MYSETV+JROC + IF (IRECV > NPRTRV) IRECV = IRECV-NPRTRV + IRECVSET = IRECV + CALL SET2PE(IRECV,0,0,MYSETW,IRECVSET) + IRREQ = IRREQ+1 + CALL GSTATS(801,0) + CALL MPL_RECV(ZRCVBUFV(:,IRECVSET),KSOURCE=NPRCIDS(IRECV), & + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IRECVREQ(IRREQ),& + & KTAG=ITAG,CDSTRING='SUTRLE:') + CALL GSTATS(801,1) +ENDDO + +ISREQ = 0 +DO JROC=1,NPRTRV-1 + ISEND = MYSETV-JROC + IF (ISEND <= 0) ISEND = ISEND+NPRTRV + ISENDSET = ISEND + CALL SET2PE(ISEND,0,0,MYSETW,ISENDSET) + ISREQ = ISREQ+1 + CALL GSTATS(801,0) + CALL MPL_SEND(ZSNDBUFV(1:ISENDSIZE),KDEST=NPRCIDS(ISEND), & + &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(ISREQ),& + & KTAG=ITAG,CDSTRING='SUTRLE:') + CALL GSTATS(801,1) +ENDDO + +IF(ISREQ > 0) THEN + CALL GSTATS(801,0) + CALL MPL_WAIT(KREQUEST=ISENDREQ(1:ISREQ), & + & CDSTRING='SUTRLE: WAIT') + CALL GSTATS(801,1) +ENDIF + +IF(IRREQ > 0) THEN + CALL GSTATS(801,0) + CALL MPL_WAIT(KREQUEST=IRECVREQ(1:IRREQ), & + & CDSTRING='SUTRLE: WAIT') + CALL GSTATS(801,1) +ENDIF + +!* copy data from buffer to f%rpnm +CALL GSTATS(1141,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JROC,IRECV,IRECVSET,IGL,JMLOC,IM,IOFFT,IOFFG,JN) +DO JROC=1,NPRTRV-1 + IRECV = MYSETV+JROC + IF (IRECV > NPRTRV) IRECV = IRECV-NPRTRV + IRECVSET = IRECV + CALL SET2PE(IRECV,0,0,MYSETW,IRECVSET) + IGL = ZRCVBUFV(1,IRECVSET) + IGLVS(IRECVSET)=IGL + IF( IGL > 0 )THEN + DO JMLOC=1,D%NUMP + IM = D%MYMS(JMLOC) + IOFFT = D%NPMT(IM) + IOFFG = D%NPMG(IM) + DO JN=1,R%NTMAX-IM+2 + F%RPNM(IGL,IOFFT+JN) = ZRCVBUFV(1+IOFFG+JN,IRECVSET) + ENDDO + ENDDO + ENDIF +ENDDO +!$OMP END PARALLEL DO + +DEALLOCATE (ZSNDBUFV) + +!* copy data from pnm to rpnm + +IGLVS(MYSETV)=KGL +IF(KGL > 0) THEN + ZRCVBUFV(1,MYSETV)=KGL + ZRCVBUFV(2:R%NSPOLEG+1,MYSETV)=PNM(1:R%NSPOLEG) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JMLOC,IM,IOFFT,IOFFG,JN) + DO JMLOC=1,D%NUMP + IM = D%MYMS(JMLOC) + IOFFT = D%NPMT(IM) + IOFFG = D%NPMG(IM) + DO JN=1,R%NTMAX-IM+2 + F%RPNM(KGL,IOFFT+JN) = PNM(IOFFG+JN) + ENDDO + ENDDO +!$OMP END PARALLEL DO +ENDIF +CALL GSTATS(1141,1) + + +! +! Now do communications in the NPRTRW direction +! + +!* Calculate send buffer size + +ISENDSIZE=0 +DO JROC=1,NPRTRW-1 + ISEND = MYSETW-JROC + IF (ISEND <= 0) ISEND = ISEND+NPRTRW + ISENDSET = ISEND + CALL SET2PE(ISEND,0,0,ISENDSET,MYSETV) + IPOS = 0 + DO JM=0,R%NSMAX + IF (ISENDSET == D%NPROCM(JM) ) IPOS = IPOS + R%NTMAX-JM+2 + ENDDO + ISENDSIZE = MAX(IPOS,ISENDSIZE) +ENDDO +ISENDSIZE=ISENDSIZE*NPRTRV+NPRTRV +IRECVSIZE=ISENDSIZE +IF( NPROC > 1 )THEN + CALL GSTATS(801,0) + CALL MPL_ALLREDUCE(IRECVSIZE,'MAX',CDSTRING='SUTRLE:') + CALL GSTATS(801,1) +ENDIF + +ALLOCATE (ZSNDBUFW(ISENDSIZE,NPRTRW)) +ALLOCATE (ZRCVBUFW(IRECVSIZE,NPRTRW)) + +CALL GSTATS(1141,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JROC,ISEND,ISENDSET,IPOS,JV,IGL,JM,JN) +DO JROC=1,NPRTRW-1 + ISEND = MYSETW-JROC + IF (ISEND <= 0) ISEND = ISEND+NPRTRW + ISENDSET = ISEND + CALL SET2PE(ISEND,0,0,ISENDSET,MYSETV) +!* copy data to be sent into zsndbufw + IPOS=0 + DO JV=1,NPRTRV + IPOS=IPOS+1 + ZSNDBUFW(IPOS,ISENDSET) = IGLVS(JV) + ENDDO + DO JV=1,NPRTRV + IGL = IGLVS(JV) + IF( IGL > 0 )THEN + DO JM=0,R%NSMAX + IF (ISENDSET == D%NPROCM(JM) ) THEN + DO JN=1,R%NTMAX-JM+2 + IPOS = IPOS + 1 + ZSNDBUFW(IPOS,ISENDSET) = ZRCVBUFV(1+D%NPMG(JM)+JN,JV) + ENDDO + ENDIF + ENDDO + ENDIF + ENDDO + IPOSW(ISENDSET)=IPOS +ENDDO +!$OMP END PARALLEL DO +CALL GSTATS(1141,1) + +IRREQ = 0 +DO JROC=1,NPRTRW-1 + + IRECV = MYSETW+JROC + IF (IRECV > NPRTRW) IRECV = IRECV-NPRTRW + IRECVSET = IRECV + CALL SET2PE(IRECV,0,0,IRECVSET,MYSETV) +!* receive message (if not empty) + + IRREQ = IRREQ+1 + CALL GSTATS(801,0) + CALL MPL_RECV(ZRCVBUFW(:,IRECVSET),KSOURCE=NPRCIDS(IRECV), & + & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IRECVREQ(IRREQ),& + & KTAG=ITAG,CDSTRING='SUTRLE:') + CALL GSTATS(801,1) +ENDDO + +ISREQ = 0 +DO JROC=1,NPRTRW-1 + ISEND = MYSETW-JROC + IF (ISEND <= 0) ISEND = ISEND+NPRTRW + ISENDSET = ISEND + CALL SET2PE(ISEND,0,0,ISENDSET,MYSETV) + ISENDSIZE = IPOSW(ISENDSET) + ISREQ = ISREQ+1 + CALL GSTATS(801,0) + CALL MPL_SEND(ZSNDBUFW(1:ISENDSIZE,ISENDSET),KDEST=NPRCIDS(ISEND), & + & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(ISREQ),& + & KTAG=ITAG,CDSTRING='SUTRLE:') + CALL GSTATS(801,1) +ENDDO + + +IF(ISREQ > 0) THEN + CALL GSTATS(801,0) + CALL MPL_WAIT(KREQUEST=ISENDREQ(1:ISREQ), & + & CDSTRING='SUTRLE: WAIT') + CALL GSTATS(801,1) +ENDIF + +IF(IRREQ > 0) THEN + CALL GSTATS(801,0) + CALL MPL_WAIT(KREQUEST=IRECVREQ(1:IRREQ), & + & CDSTRING='SUTRLE: WAIT') + CALL GSTATS(801,1) +ENDIF + +CALL GSTATS(1141,0) +!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JROC,IRECV,IRECVSET,IPOS,IGLVR,JV,IGL,JMLOC,IM,IOFFT,JN) +DO JROC=1,NPRTRW-1 + IRECV = MYSETW+JROC + IF (IRECV > NPRTRW) IRECV = IRECV-NPRTRW + IRECVSET = IRECV + CALL SET2PE(IRECV,0,0,IRECVSET,MYSETV) +!* copy data from buffer to f%rpnm + IPOS=0 + DO JV=1,NPRTRV + IPOS=IPOS+1 + IGLVR(JV)=ZRCVBUFW(IPOS,IRECVSET) + ENDDO + DO JV=1,NPRTRV + IGL = IGLVR(JV) + IF( IGL > 0 )THEN + DO JMLOC=1,D%NUMP + IM = D%MYMS(JMLOC) + IOFFT = D%NPMT(IM) + DO JN=1,R%NTMAX-IM+2 + IPOS = IPOS + 1 + F%RPNM(IGL,IOFFT+JN) = ZRCVBUFW(IPOS,IRECVSET) + ENDDO + ENDDO + ENDIF + ENDDO +ENDDO +!$OMP END PARALLEL DO +CALL GSTATS(1141,1) + +DEALLOCATE (ZRCVBUFV) +DEALLOCATE (ZSNDBUFW) +DEALLOCATE (ZRCVBUFW) + +IF( NPROC > 1 .AND. KLOOP ==1)THEN + CALL GSTATS(783,0) + CALL MPL_BARRIER(CDSTRING='SUTRLE:') + CALL GSTATS(783,1) +ENDIF +END SUBROUTINE SUTRLE +END MODULE SUTRLE_MOD diff --git a/src/trans/gpu/internal/suwavedi_mod.F90 b/src/trans/gpu/internal/suwavedi_mod.F90 new file mode 100755 index 00000000..6995b2ee --- /dev/null +++ b/src/trans/gpu/internal/suwavedi_mod.F90 @@ -0,0 +1,186 @@ +! (C) Copyright 1996- ECMWF. +! (C) Copyright 1996- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SUWAVEDI_MOD +CONTAINS +SUBROUTINE SUWAVEDI(KSMAX,KTMAX,KPRTRW,KMYSETW,KASM0,KSPOLEGL,KPROCM,& + &KUMPP,KSPEC,KSPEC2,KSPEC2MX,KPOSSP,KMYMS,& + &KPTRMS,KALLMS,KDIM0G) + +!**** *SUWAVEDI * - Routine to initialize spectral wave distribution + +! Purpose. +! -------- +! Initialize arrays controlling spectral wave distribution + +!** Interface. +! ---------- +! *CALL* *SUWAVEDI * + +! Explicit arguments : +! -------------------- +! KSMAX - Spectral truncation limit (input) +! KTMAX - Overtruncation for KSMAX (input) +! KPRTRW - Number of processors in A-direction (input) +! KMYSETW - A-set for present processor (input) +! KASM0 - Offsets for spectral waves (output) +! KSPOLEGL - Local version of NSPOLEG (output) +! KPROCM - Where a certain spectral wave belongs (output) +! KUMPP - Number of spectral waves on this PE (output) +! KSPEC - Local version on NSPEC (output) +! KSPEC2 - Local version on NSPEC2 (output) +! KSPEC2MX - Maximum KSPEC2 across PEs (output) +! KPOSSP - Global spectral fields partitioning (output) +! KMYMS - This PEs spectral zonal wavenumbers (output) +! KPTRMS - Pointer to the first wave number of a given a-set (output) +! KALLMS - Wave numbers for all wave-set concatenated together +! to give all wave numbers in wave-set order (output) + +! Implicit arguments : NONE +! -------------------- + +! Method. +! ------- +! See documentation + +! Externals. NONE. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! MPP Group *ECMWF* + +! Modifications. +! -------------- +! Original : 96-01-10 +! L.Isaksen: 96-02-02 - Calculation of KSPEC2MX added +! K.YESSAD : 97-02-18 - Add KTMAX, bug correction for KSPOLEGL. +! ------------------------------------------------------------------ + +USE EC_PARKIND ,ONLY : JPIM + +IMPLICIT NONE + + +! DUMMY +INTEGER(KIND=JPIM),INTENT(IN) :: KSMAX +INTEGER(KIND=JPIM),INTENT(IN) :: KTMAX +INTEGER(KIND=JPIM),INTENT(IN) :: KPRTRW +INTEGER(KIND=JPIM),INTENT(IN) :: KMYSETW +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KSPEC +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KSPEC2 +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KSPEC2MX +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KSPOLEGL + +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KASM0(0:KSMAX) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KPROCM(0:KSMAX) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KUMPP(KPRTRW) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KMYMS(KSMAX+1) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KPOSSP(KPRTRW+1) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KPTRMS(KPRTRW) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KALLMS(KSMAX+1) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(OUT) :: KDIM0G(0:KSMAX) + +! LOCAL +INTEGER(KIND=JPIM) :: IK, IL, IND, IPOS, ISPEC2P, JA, JM,JMLOC,IM +INTEGER(KIND=JPIM) :: ISPOLEGL,ISPEC2MX,IASM0(0:KSMAX),IPROCM(0:KSMAX) +INTEGER(KIND=JPIM) :: IUMPP(KPRTRW),IMYMS(KSMAX+1),IPOSSP(KPRTRW+1) +INTEGER(KIND=JPIM) :: IPTRMS(KPRTRW),IALLMS(KSMAX+1),IDIM0G(0:KSMAX) +INTEGER(KIND=JPIM) :: ISPEC(KPRTRW),IC(KPRTRW) + + +! ----------------------------------------------------------------- + +!* 1. Initialize partitioning of wave numbers to PEs +! ---------------------------------------------- + +ISPEC(:) = 0 + +IUMPP(:) = 0 +IASM0(:) = -99 +ISPOLEGL = 0 + +IL = 1 +IND = 1 +IK = 0 +IPOS = 1 +DO JM=0,KSMAX + IK = IK + IND + IF (IK > KPRTRW) THEN + IK = KPRTRW + IND = -1 + ELSEIF (IK < 1) THEN + IK = 1 + IND = 1 + ENDIF + IPROCM(JM) = IK + ISPEC(IK) = ISPEC(IK)+KSMAX-JM+1 + IUMPP(IK) = IUMPP(IK)+1 + IF (IK == KMYSETW) THEN + ISPOLEGL = ISPOLEGL +KTMAX+1-JM+1 + IMYMS(IL) = JM + IASM0(JM) = IPOS + IPOS = IPOS+(KSMAX-JM+1)*2 + IL = IL+1 + ENDIF +ENDDO + +IPOSSP(1) = 1 +ISPEC2P = 2*ISPEC(1) +ISPEC2MX = ISPEC2P +IPTRMS(1) = 1 +DO JA=2,KPRTRW + IPOSSP(JA) = IPOSSP(JA-1)+ISPEC2P + ISPEC2P = 2*ISPEC(JA) + ISPEC2MX = MAX(ISPEC2MX,ISPEC2P) +! pointer to the first wave number of a given wave-set in NALLMS array + IPTRMS(JA) = IPTRMS(JA-1)+IUMPP(JA-1) +ENDDO +IPOSSP(KPRTRW+1) = IPOSSP(KPRTRW)+ISPEC2P + +! IALLMS : wave numbers for all wave-set concatenated together to give all +! wave numbers in wave-set order. +IC(:) = 0 +DO JM=0,KSMAX + IALLMS(IC(IPROCM(JM))+IPTRMS(IPROCM(JM))) = JM + IC(IPROCM(JM)) = IC(IPROCM(JM))+1 +ENDDO + +IPOS = 1 +DO JA=1,KPRTRW + DO JMLOC=1,IUMPP(JA) + IM = IALLMS(IPTRMS(JA)+JMLOC-1) + IDIM0G(IM) = IPOS + IPOS = IPOS+(KSMAX+1-IM)*2 + ENDDO +ENDDO + +IF(PRESENT(KSPEC)) KSPEC = ISPEC(KMYSETW) +IF(PRESENT(KSPEC2)) KSPEC2 = 2*ISPEC(KMYSETW) +IF(PRESENT(KSPEC2MX)) KSPEC2MX = ISPEC2MX +IF(PRESENT(KSPOLEGL)) KSPOLEGL = ISPOLEGL + +IF(PRESENT(KASM0)) KASM0(:) = IASM0(:) +IF(PRESENT(KPROCM)) KPROCM(:) = IPROCM(:) +IF(PRESENT(KUMPP)) KUMPP(:) = IUMPP(:) +IF(PRESENT(KMYMS)) KMYMS(:) = IMYMS(:) +IF(PRESENT(KPOSSP)) KPOSSP(:) = IPOSSP(:) +IF(PRESENT(KPTRMS)) KPTRMS(:) = IPTRMS(:) +IF(PRESENT(KALLMS)) KALLMS(:) = IALLMS(:) +IF(PRESENT(KDIM0G)) KDIM0G(:) = IDIM0G(:) + +END SUBROUTINE SUWAVEDI +END MODULE SUWAVEDI_MOD + + diff --git a/src/trans/gpu/internal/tpm_constants.F90 b/src/trans/gpu/internal/tpm_constants.F90 new file mode 100755 index 00000000..6f8ab2b7 --- /dev/null +++ b/src/trans/gpu/internal/tpm_constants.F90 @@ -0,0 +1,20 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_CONSTANTS +USE PARKIND_ECTRANS ,ONLY : JPRBT + +IMPLICIT NONE + +SAVE + +REAL(KIND=JPRBT) :: RA ! Radius of Earth + +END MODULE TPM_CONSTANTS diff --git a/src/trans/gpu/internal/tpm_ctl.F90 b/src/trans/gpu/internal/tpm_ctl.F90 new file mode 100755 index 00000000..7b967ee0 --- /dev/null +++ b/src/trans/gpu/internal/tpm_ctl.F90 @@ -0,0 +1,43 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_CTL + +USE PARKIND1 ,ONLY : JPIM +USE, INTRINSIC :: iso_c_binding, ONLY: C_PTR, C_NULL_PTR +USE SHAREDMEM_MOD ,ONLY : SHAREDMEM +IMPLICIT NONE + +SAVE + + +TYPE CTL_TYPE + +LOGICAL :: LREAD_LEGPOL = .FALSE. +LOGICAL :: LWRITE_LEGPOL = .FALSE. +CHARACTER(LEN=256) :: CLEGPOLFNAME='legpol_file' +CHARACTER(LEN=4) :: CIO_TYPE='file' +TYPE(SHAREDMEM) :: STORAGE + +END TYPE CTL_TYPE + + +TYPE(CTL_TYPE),ALLOCATABLE,TARGET :: CTL_RESOL(:) +TYPE(CTL_TYPE),POINTER :: C + + +END MODULE TPM_CTL + + + + + + + diff --git a/src/trans/gpu/internal/tpm_dim.F90 b/src/trans/gpu/internal/tpm_dim.F90 new file mode 100755 index 00000000..181e4bd7 --- /dev/null +++ b/src/trans/gpu/internal/tpm_dim.F90 @@ -0,0 +1,58 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_DIM + +! Module for dimensions. + +USE PARKIND1 ,ONLY : JPIM + +IMPLICIT NONE + +SAVE + +TYPE DIM_TYPE +! SPECTRAL SPACE DIMENSIONS + + INTEGER(KIND=JPIM) :: NSMAX ! Truncation order + INTEGER(KIND=JPIM) :: NTMAX ! Truncation order for tendencies + INTEGER(KIND=JPIM) :: NSPOLEG ! Number of Legandre polynomials + INTEGER(KIND=JPIM) :: NSPEC_G ! Number of complex spectral coefficients (global) + INTEGER(KIND=JPIM) :: NSPEC2_G ! 2*NSPEC_G + +! COLLOCATION GRID DIMENSIONS + + INTEGER(KIND=JPIM) :: NDGL ! Number of rows of latitudes + INTEGER(KIND=JPIM) :: NDLON ! Maximum number of longitude points (near equator) + INTEGER(KIND=JPIM) :: NDGNH ! Number of rows in northern hemisphere + +! Legendre transform dimensions + INTEGER(KIND=JPIM) :: NLEI1 ! R%NSMAX+4+MOD(R%NSMAX+4+1,2) + INTEGER(KIND=JPIM) :: NLEI3 ! R%NDGNH+MOD(R%NDGNH+2,2) + INTEGER(KIND=JPIM) :: NLED3 ! R%NTMAX+2+MOD(R%NTMAX+3,2) + INTEGER(KIND=JPIM) :: NLED4 ! R%NTMAX+3+MOD(R%NTMAX+4,2) + +! Width of E'-zone + INTEGER(KIND=JPIM) :: NNOEXTZL ! Longitude direction + INTEGER(KIND=JPIM) :: NNOEXTZG ! Latitude direction + +END TYPE DIM_TYPE + +TYPE(DIM_TYPE),ALLOCATABLE,TARGET :: DIM_RESOL(:) +TYPE(DIM_TYPE),POINTER :: R + +! flat copies of above +INTEGER(KIND=JPIM) :: R_NSMAX ! Truncation order +INTEGER(KIND=JPIM) :: R_NTMAX ! Truncation order for tendencies +INTEGER(KIND=JPIM) :: R_NDGNH ! Number of rows in northern hemisphere +INTEGER(KIND=JPIM) :: R_NDGL ! Number of rows of latitudes + +END MODULE TPM_DIM diff --git a/src/trans/gpu/internal/tpm_distr.F90 b/src/trans/gpu/internal/tpm_distr.F90 new file mode 100755 index 00000000..08812292 --- /dev/null +++ b/src/trans/gpu/internal/tpm_distr.F90 @@ -0,0 +1,196 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_DISTR + +! Module for distributed memory environment. + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +IMPLICIT NONE + +SAVE + +!* Variables describing distributed memory parallelization + +INTEGER(KIND=JPIM) :: NPROC ! Number of processors (NPRGPNS*NPRGPEW) +INTEGER(KIND=JPIM) :: NPRGPNS ! No. of sets in N-S direction (grid-point space) +INTEGER(KIND=JPIM) :: NPRGPEW ! No. of sets in E-W direction (grid-point space) +INTEGER(KIND=JPIM) :: NPRTRW ! No. of sets in wave direction (spectral space) +INTEGER(KIND=JPIM) :: NPRTRV ! NPROC/NPRTRW +INTEGER(KIND=JPIM) :: NPRTRNS ! No. of sets in N-S direction (Fourier space) + ! (always equal to NPRTRW) +LOGICAL :: LEQ_REGIONS ! TRUE - Use new eq_regions partitioning + ! FALSE- Use old NPRGPNS x NPRGPEW partitioning +INTEGER(KIND=JPIM) :: MYPROC ! My processor number +INTEGER(KIND=JPIM) :: MYSETW ! My set number in wave direction (spectral space) +INTEGER(KIND=JPIM) :: MYSETV ! My set number in field direction(S.S and F.S) +INTEGER(KIND=JPIM) :: NCOMBFLEN ! Size of communication buffer + +INTEGER(KIND=JPIM) :: MTAGLETR ! Tag +INTEGER(KIND=JPIM) :: MTAGML ! Tag +INTEGER(KIND=JPIM) :: MTAGLG ! Tag +INTEGER(KIND=JPIM) :: MTAGGL ! Tag +INTEGER(KIND=JPIM) :: MTAGPART ! Tag +INTEGER(KIND=JPIM) :: MTAGDISTSP ! Tag +INTEGER(KIND=JPIM) :: MTAGLM ! Tag +INTEGER(KIND=JPIM) :: MTAGDISTGP ! Tag + +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPRCIDS(:) ! Array containing the process ids + +TYPE DISTR_TYPE +LOGICAL :: LGRIDONLY ! TRUE - only grid space structures are available +LOGICAL :: LWEIGHTED_DISTR ! TRUE - weighted distribution +LOGICAL :: LSPLIT ! TRUE - latitudes are shared between a-sets +LOGICAL :: LCPNMONLY ! TRUE - Compute Legendre polynomials only, not FFTs + +! SPECTRAL SPACE + +INTEGER(KIND=JPIM) :: NUMP ! No. of spectral waves handled by this processor +INTEGER(KIND=JPIM) :: NSPEC ! No. of complex spectral coefficients (on this PE) +INTEGER(KIND=JPIM) :: NSPEC2 ! 2*NSPEC +INTEGER(KIND=JPIM) :: NSPEC2MX ! maximun NSPEC2 among all PEs +INTEGER(KIND=JPIM) :: NTPEC2 ! cf. NSPEC2 but for truncation NTMAX +INTEGER(KIND=JPIM) :: NUMTP ! cf. NUMP but for truncation NTMAX + +INTEGER(KIND=JPIM) :: NSPOLEGL ! No. of legendre polynomials on this PE +INTEGER(KIND=JPIM) :: NLEI3D ! (NLEI3-1)/NPRTRW+1 + +INTEGER(KIND=JPIM) ,ALLOCATABLE :: MYMS(:) ! Wave numbers handled by this PE +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NUMPP(:) ! No. of wave numbers each wave set is + ! responsible for +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPOSSP(:) ! Not needed in transform? +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCM(:) ! Process that does the calc. for certain + ! wavenumber M +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NDIM0G(:) ! Defines partitioning of global spectral + ! fields among PEs + +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NASM0(:) ! Address in a spectral array of (m, n=m) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NATM0(:) ! Same as NASM0 but for NTMAX +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NALLMS(:) ! Wave numbers for all a-set concatenated + ! together to give all wave numbers in a-set + ! order. Used when global spectral norms + ! have to be gathered. +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPTRMS(:) ! Pointer to the first wave number of a given + ! a-set in nallms array. + + +! Legendre polynomials + +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLATLS(:,:) ! First latitude for which each a-set,bset calcul. +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLATLE(:,:) ! Last latitude for which each a-set,bset calcul. + +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPMT(:) ! Adress for legendre polynomial for + ! given M (NTMAX) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPMS(:) ! Adress for legendre polynomial for + ! given M (NSMAX) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPMG(:) ! Global version of NPMS + +! FOURIER SPACE + +INTEGER(KIND=JPIM) :: NDGL_FS ! Number of rows of latitudes for which this process is + ! performing Fourier Space calculations + +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGTF(:) ! Offset for specific latitude in + ! Fourier/gridpoint buffer +INTEGER(KIND=JPIM) :: NLENGTF ! Second dimension of Fourier/gridpoint buffer + ! (sum of (NLOEN+3) over local latitudes) + +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NULTPP(:) ! No of lats. for each wave_set (F.S) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCL(:) ! Process responsible for each lat. (F.S) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPTRLS(:) ! Pointer to first lat. (F.S) + +! NSTAGT0B to NLENGT1B: help arrays for spectral to fourier space transposition +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGT0B(:) ! Start adresses for segments within buffer + ! (according to processors to whom data + ! is going to be sent) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGT1B(:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPNTGTB0(:,:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPNTGTB1(:,:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLTSFTB(:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLTSGTB(:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: MSTABF(:) +INTEGER(KIND=JPIM) :: NLENGT0B ! dimension +INTEGER(KIND=JPIM) :: NLENGT1B ! dimension + +! GRIDPOINT SPACE + +INTEGER(KIND=JPIM) :: NDGL_GP ! D%NLSTLAT(MY_REGION_NS)-D%NFRSTLOFF +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NFRSTLAT(:) ! First lat of each a-set +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLSTLAT(:) ! Last lat of each a-set +INTEGER(KIND=JPIM) :: NFRSTLOFF ! Offset for first lat of own a-set + ! i.e. NFRSTLOFF=NFRSTLAT(MYSETA)-1 +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPTRLAT(:) ! Pointer to start of latitude +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPTRFRSTLAT(:) ! Pointer to the first latitude of each + ! a-set in NSTA and NONL arrays +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPTRLSTLAT(:) ! Pointer to the last latitude of each + ! a-set in NSTA and NONL arrays +INTEGER(KIND=JPIM) :: NPTRFLOFF ! Offset for pointer to the first latitude of own a-set + ! NSTA and NONL arrays, i.e. NPTRFRSTLAT(MYSETA)-1 +LOGICAL ,ALLOCATABLE :: LSPLITLAT(:) ! True if latitude is split over 2 a-sets + +! NSTA(R%NDGL+NPRGPNS-1,NPRGPEW) : Position of first grid column +! for the latitudes on a processor. The information is +! available for all processors. The b-sets are distinguished +! by the last dimension of NSTA(). The latitude band for +! each a-set is addressed by NPTRFRSTLAT(JASET), +! NPTRLSTLAT(JASET), and NPTRFLOFF=NPTRFRSTLAT(MYSETA) on +! this processors a-set. Each split latitude has two entries +! in NSTA(,:) which necessitates the rather complex +! addressing of NSTA(,:) and the overdimensioning of NSTA by +! NPRGPNS. +! NONL(R%NDGL+NPRGPNS-1,NPRGPEW) : Number of grid columns for +! the latitudes on a processor. Similar to NSTA() in data +! structure. +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTA(:,:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NONL(:,:) + +INTEGER(KIND=JPIM) :: NGPTOT ! Total number of grid columns on this PE +INTEGER(KIND=JPIM) :: NGPTOTG ! Total number of grid columns on the Globe +INTEGER(KIND=JPIM) :: NGPTOTMX ! Maximum number of grid columns on any of the PEs +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NGPTOTL(:,:) ! Number of grid columns on each PE. + +REAL(KIND=JPRBT) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set + +INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:) + +END TYPE DISTR_TYPE + +!flat versions of the above +INTEGER(KIND=JPIM) :: D_NUMP ! No. of spectral waves handled by this processor +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MYMS(:) ! Wave numbers handled by this PE +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT0B(:) ! Start adresses for segments within buffer + ! (according to processors to whom data + ! is going to be sent) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT1B(:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCL(:) ! Process responsible for each lat. (F.S) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB1(:,:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NASM0(:) ! Address in a spectral array of (m, n=m) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGTF(:) ! Offset for specific latitude in +INTEGER(KIND=JPIM) :: D_NDGL_FS ! Number of rows of latitudes for which this process is + ! performing Fourier Space calculations +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MSTABF(:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB0(:,:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCM(:) ! Process that does the calc. for certain +INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPTRLS(:) ! Pointer to first lat. (F.S) + + +! The offsets in the input and output arrays to the gemms. +! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans) +! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans) +INTEGER(KIND=JPIM), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) + +TYPE(DISTR_TYPE),ALLOCATABLE,TARGET :: DISTR_RESOL(:) +TYPE(DISTR_TYPE),POINTER :: D + +END MODULE TPM_DISTR + diff --git a/src/trans/gpu/internal/tpm_fft.F90 b/src/trans/gpu/internal/tpm_fft.F90 new file mode 100755 index 00000000..01594a80 --- /dev/null +++ b/src/trans/gpu/internal/tpm_fft.F90 @@ -0,0 +1,29 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_FFT +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +! Module for Fourier transforms. + +IMPLICIT NONE + +SAVE + +TYPE FFT_TYPE + REAL(KIND=JPRBT) ,ALLOCATABLE :: TRIGS(:,:) ! list of trigonometric function values + INTEGER(KIND=JPIM),ALLOCATABLE :: NFAX(:,:) ! list of factors of truncation +END TYPE FFT_TYPE + +TYPE(FFT_TYPE),ALLOCATABLE,TARGET :: FFT_RESOL(:) +TYPE(FFT_TYPE),POINTER :: T + + +END MODULE TPM_FFT diff --git a/src/trans/gpu/internal/tpm_fields.F90 b/src/trans/gpu/internal/tpm_fields.F90 new file mode 100755 index 00000000..1f917730 --- /dev/null +++ b/src/trans/gpu/internal/tpm_fields.F90 @@ -0,0 +1,58 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_FIELDS + +USE PARKIND_ECTRANS ,ONLY : JPIM, JPIB, JPRB, JPRBT, JPRD +USE ISO_C_BINDING + +IMPLICIT NONE + +SAVE + +TYPE FIELDS_TYPE +REAL(KIND=JPRD) ,ALLOCATABLE :: RPNM(:,:) ! Legendre polynomials +REAL(KIND=JPRD) ,ALLOCATABLE :: RMU(:) ! sin(theta) for Gaussian latitudes +REAL(KIND=JPRBT) ,ALLOCATABLE :: RW(:) ! Weights of the Gaussian quadrature +REAL(KIND=JPRBT) ,ALLOCATABLE :: R1MU2(:) ! 1.-MU*MU, cos(theta)**2 +REAL(KIND=JPRBT) ,ALLOCATABLE :: RACTHE(:) ! 1./SQRT(R1MU2), 1/(cos(theta)) + +REAL(KIND=JPRBT) ,ALLOCATABLE :: REPSNM(:) ! eps(n,m) used in the Legendre transforms +REAL(KIND=JPRBT) ,ALLOCATABLE :: RN(:) ! n (to avoid integer to real conversion) +REAL(KIND=JPRBT) ,ALLOCATABLE :: RLAPIN(:) ! eigen-values of the inverse Laplace operator +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLTN(:) ! R%NTMAX+2-JN + +REAL(KIND=JPRBT) ,ALLOCATABLE :: RMU2(:) ! sin(theta) for dual input/output latitudes +REAL(KIND=JPRBT) ,ALLOCATABLE :: RACTHE2(:)! 1./SQRT(R1MU2), 1/(cos(theta)) dual input/output latitudes +END TYPE FIELDS_TYPE + +!flat copies of the above +REAL(KIND=JPRBT) ,ALLOCATABLE :: F_RW(:) ! Weights of the Gaussian quadrature +REAL(KIND=JPRBT) ,ALLOCATABLE :: F_RN(:) ! n (to avoid integer to real conversion) +REAL(KIND=JPRBT) ,ALLOCATABLE :: F_RLAPIN(:) ! eigen-values of the inverse Laplace operator +REAL(KIND=JPRBT) ,ALLOCATABLE :: F_RACTHE(:) ! eigen-values of the inverse Laplace operator + +TYPE(FIELDS_TYPE),ALLOCATABLE,TARGET :: FIELDS_RESOL(:) +TYPE(FIELDS_TYPE),POINTER :: F + +! scratch arrays for ltinv and ltdir and associated dimension variables + +REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:) !! JPRL for 1/2 + +! for m=0 in ledir_mod: +REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:) +REAL(KIND=JPRD),ALLOCATABLE :: ZAS0(:,:) +INTEGER(KIND=JPIM) :: KMLOC0 + +REAL(KIND=JPRBT),ALLOCATABLE :: ZEPSNM(:,:) + +END MODULE TPM_FIELDS diff --git a/src/trans/gpu/internal/tpm_flt.F90 b/src/trans/gpu/internal/tpm_flt.F90 new file mode 100755 index 00000000..c1fb4be1 --- /dev/null +++ b/src/trans/gpu/internal/tpm_flt.F90 @@ -0,0 +1,64 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_FLT + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT, JPRD +USE SEEFMM_MIX +IMPLICIT NONE + +SAVE + + +TYPE FLT_TYPE +INTEGER(KIND=JPIM) :: NSPOLEGL +INTEGER(KIND=JPIM) :: NDGNH +INTEGER(KIND=JPIM) :: INS2 +INTEGER(KIND=JPIM) :: INA2 +REAL(KIND=JPRBT) ,POINTER :: RPNMS(:,:) ! Legendre polynomials +REAL(KIND=JPRBT) ,POINTER :: RPNMA(:,:) ! Legendre polynomials +REAL(KIND=JPRD) ,POINTER :: RPNMDS(:,:) ! Legendre polynomials +REAL(KIND=JPRD) ,POINTER :: RPNMDA(:,:) ! Legendre polynomials +REAL(KIND=JPRBT) :: RCS +REAL(KIND=JPRBT) :: RCA +!REAL(KIND=JPRBT) ,POINTER :: RPNMCDO(:,:) ! Legendre polynomials for C-D formula at orig roots +!REAL(KIND=JPRBT) ,POINTER :: RPNMCDD(:,:) ! Legendre polynomials for C-D formula at dual roots +REAL(KIND=JPRBT) ,POINTER :: RPNMWI(:,:) ! special weights +REAL(KIND=JPRBT) ,POINTER :: RPNMWO(:,:) ! special weights +INTEGER(KIND=JPIM) :: ISLD ! starting latitude dual + +! Butterfly + +INTEGER(KIND=JPIM) :: MAXCOLS + +END TYPE FLT_TYPE + +TYPE FLT_TYPE_WRAP +TYPE(FLT_TYPE),ALLOCATABLE :: FA(:) +LOGICAL :: LDLL +LOGICAL :: LSHIFTLL +LOGICAL :: LUSE_BELUSOV +LOGICAL :: LKEEPRPNM +LOGICAL :: LSOUTHPNM ! .TRUE. to compute Legendre polynomials on southern hemisphere +INTEGER(KIND=JPIM) :: IMLOC +INTEGER(KIND=JPIM) :: ITHRESHOLD +INTEGER(KIND=JPIM) :: NDGNHD ! dual set dimension +INTEGER(KIND=JPIM) :: NDLON ! dual number of longitudes +INTEGER(KIND=JPIM) :: NDGL ! dual number of latitudes +LOGICAL :: LSYM +TYPE(FMM_TYPE),POINTER :: FMM_INTI ! FMM interpolation + +END TYPE FLT_TYPE_WRAP + +TYPE(FLT_TYPE_WRAP),ALLOCATABLE,TARGET :: FLT_RESOL(:) +TYPE(FLT_TYPE_WRAP),POINTER :: S + + +END MODULE TPM_FLT diff --git a/src/trans/gpu/internal/tpm_gen.F90 b/src/trans/gpu/internal/tpm_gen.F90 new file mode 100755 index 00000000..cf38f749 --- /dev/null +++ b/src/trans/gpu/internal/tpm_gen.F90 @@ -0,0 +1,45 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_GEN + +! Module for general control variables. + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +IMPLICIT NONE + +SAVE + +INTEGER(KIND=JPIM) :: NOUT ! Unit number for "standard" output +INTEGER(KIND=JPIM) :: NERR ! Unit number for error messages +INTEGER(KIND=JPIM) :: NPRINTLEV ! Printing level, 0=no print, 1=standard,2=debug + +INTEGER(KIND=JPIM) :: MSETUP0 = 0 ! Control of setup calls +INTEGER(KIND=JPIM) :: NMAX_RESOL = 0 ! Maximum allowed number of resolutions +INTEGER(KIND=JPIM) :: NCUR_RESOL = 0 ! Current resolution +INTEGER(KIND=JPIM) :: NDEF_RESOL = 0 ! Number of defined resolutions +INTEGER(KIND=JPIM) :: NPROMATR ! Packet size for transform (in no of fields) + ! NPROMATR=0 means do all fields together (dflt) + +LOGICAL :: LALLOPERM ! Allocate some shared data structures permanently +LOGICAL :: LMPOFF ! true: switch off message passing +LOGICAL :: LSYNC_TRANS ! true: activate barriers in trmtol and trltom + +! Use of synchronization/blocking in Transpose (some networks do get flooded) +! 0 = Post IRECVs up-front, use ISENDs, use WAITANY to recv data (default) +! 1 = Use ISENDs, use blocking RECVs, add barrier at the end of each cycle +! 2 = Use buffered SENDs, use blocking RECVs, add barrier at the end of each cycle +INTEGER(KIND=JPIM) :: NTRANS_SYNC_LEVEL = 0 + +LOGICAL, ALLOCATABLE :: LENABLED(:) ! true: the resolution is enabled (it has been + ! initialised and has not been released afterward) + +END MODULE TPM_GEN diff --git a/src/trans/gpu/internal/tpm_geometry.F90 b/src/trans/gpu/internal/tpm_geometry.F90 new file mode 100755 index 00000000..ce1de2f7 --- /dev/null +++ b/src/trans/gpu/internal/tpm_geometry.F90 @@ -0,0 +1,45 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_GEOMETRY + +! Module containing data describing Gaussian grid. + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +IMPLICIT NONE + +SAVE + +TYPE GEOM_TYPE +INTEGER(KIND=JPIM),ALLOCATABLE :: NLOEN(:) ! NUMBER OF POINTS ON A PARALLEL +INTEGER(KIND=JPIM),ALLOCATABLE :: NMEN(:) ! ASSOCIATED CUT-OFF WAVE NUMBER +INTEGER(KIND=JPIM),ALLOCATABLE :: NDGLU(:) ! NUMBER OF HEMISPERIC LATITUDES +! FOR A GIVEN WAVE NUMBER M + +LOGICAL :: LAM ! LAM geometry if T, Global geometry if F +LOGICAL :: LREDUCED_GRID ! Reduced Gaussian grid if T +! quadratic Gaussian grid otherwise. +REAL(KIND=JPRBT) :: RSTRET ! Stretching factor (for Legendre polynomials +! computed on stretched latitudes only) +END TYPE GEOM_TYPE + +!flat copies of the above +INTEGER(KIND=JPIM),ALLOCATABLE :: G_NDGLU(:) ! NUMBER OF HEMISPERIC LATITUDES +INTEGER(KIND=JPIM),ALLOCATABLE :: G_NMEN(:) ! ASSOCIATED CUT-OFF WAVE NUMBER +INTEGER(KIND=JPIM) :: G_NMEN_MAX +INTEGER(KIND=JPIM),ALLOCATABLE :: G_NLOEN(:) ! NUMBER OF POINTS ON A PARALLEL +INTEGER(KIND=JPIM) :: G_NLOEN_MAX + +TYPE(GEOM_TYPE),ALLOCATABLE,TARGET :: GEOM_RESOL(:) +TYPE(GEOM_TYPE),POINTER :: G + +END MODULE TPM_GEOMETRY diff --git a/src/trans/gpu/internal/tpm_hicfft.F90 b/src/trans/gpu/internal/tpm_hicfft.F90 new file mode 100755 index 00000000..c59df0be --- /dev/null +++ b/src/trans/gpu/internal/tpm_hicfft.F90 @@ -0,0 +1,346 @@ +! (C) Copyright 2014- ECMWF. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_HICFFT + + ! Author. + ! ------- + ! George Mozdzynski + ! + ! Modifications. + ! -------------- + ! Original October 2014 + ! HICFFT abstraction for CUDA and HIP August 2023 B. Reuter + + USE, INTRINSIC :: ISO_C_BINDING + + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT + USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE + + IMPLICIT NONE + + SAVE + + PRIVATE + PUBLIC CREATE_PLAN_FFT, DESTROY_PLAN_FFT, DESTROY_ALL_PLANS_FFT, INIT_PLANS_FFT, EXECUTE_PLAN_FFT, & + & HICFFT_RESOL, HICT, EXECUTE_DIR_FFT, EXECUTE_INV_FFT + + TYPE HICFFT_TYPE + INTEGER(KIND=C_INT),POINTER :: N_PLANS(:) + TYPE(HICFFT_PLAN),POINTER :: HICFFT_PLANS(:) + INTEGER(KIND=C_INT) :: N_MAX=0 + END TYPE HICFFT_TYPE + + + TYPE HICFFT_PLAN + TYPE(C_PTR) :: NPLAN + INTEGER(KIND=C_INT) :: NLOT + INTEGER(KIND=C_INT) :: NSTRIDE + INTEGER(KIND=C_INT) :: NTYPE + TYPE(HICFFT_PLAN),POINTER :: NEXT_PLAN => NULL() + END TYPE HICFFT_PLAN + + TYPE(HICFFT_TYPE),ALLOCATABLE,TARGET :: HICFFT_RESOL(:) + TYPE(HICFFT_TYPE),POINTER :: HICT + +INTERFACE EXECUTE_DIR_FFT + MODULE PROCEDURE EXECUTE_DIR_FFT_FLOAT,EXECUTE_DIR_FFT_DOUBLE +END INTERFACE + +INTERFACE EXECUTE_INV_FFT + MODULE PROCEDURE EXECUTE_INV_FFT_FLOAT,EXECUTE_INV_FFT_DOUBLE +END INTERFACE + + + + ! ------------------------------------------------------------------ + CONTAINS + ! ------------------------------------------------------------------ + + + SUBROUTINE INIT_PLANS_FFT(KDLON) + INTEGER(KIND=C_INT),INTENT(IN) :: KDLON + + HICT%N_MAX=KDLON + ALLOCATE(HICT%HICFFT_PLANS(HICT%N_MAX)) + ALLOCATE(HICT%N_PLANS(HICT%N_MAX)) + HICT%N_PLANS(:)=0 + RETURN + END SUBROUTINE INIT_PLANS_FFT + + + SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT,KSTRIDE) + TYPE(C_PTR),INTENT(OUT) :: KPLAN + INTEGER(KIND=C_INT),INTENT(IN) :: KTYPE,KN,KLOT,KSTRIDE + + TYPE(C_PTR) :: IPLAN + INTEGER(KIND=C_INT) :: IRANK, ISTRIDE + INTEGER(KIND=C_INT) :: JL, JN + INTEGER(KIND=C_INT) :: IRDIST,ICDIST,IN(1),IEMBED(1) + LOGICAL :: LLFOUND + TYPE(HICFFT_PLAN),POINTER :: CURR_HICFFT_PLAN,START_HICFFT_PLAN + + INTERFACE + SUBROUTINE HICFFT_CREATE_PLAN(KPLAN,KTYPE,KN,KLOT,KSTRIDE) BIND(C,NAME="hicfft_create_plan_") + USE, INTRINSIC :: ISO_C_BINDING + TYPE(C_PTR), INTENT(OUT) :: KPLAN + INTEGER(C_INT), INTENT(IN) :: KTYPE,KN,KLOT,KSTRIDE + END SUBROUTINE HICFFT_CREATE_PLAN + END INTERFACE + + IF( KN > HICT%N_MAX )THEN + stop 'CREATE_PLAN_FFT: KN > N_MAX THAT WAS INITIALISED IN INIT_PLANS_FFT' + ENDIF + + IRANK=1 + ISTRIDE=1 + IN(1)=KN + IEMBED(1)=IN(1) + ICDIST=KN/2+1 + IRDIST=ICDIST*2 + + !!$OMP CRITICAL + LLFOUND=.FALSE. + CURR_HICFFT_PLAN=>HICT%HICFFT_PLANS(KN) + ! search for plan in existing plans + DO JL=1,HICT%N_PLANS(KN) + IF( KLOT == CURR_HICFFT_PLAN%NLOT .AND. KTYPE == CURR_HICFFT_PLAN%NTYPE & + & .AND. KSTRIDE == CURR_HICFFT_PLAN%NSTRIDE)THEN + LLFOUND=.TRUE. + IPLAN=CURR_HICFFT_PLAN%NPLAN + EXIT + ELSEIF( JL /= HICT%N_PLANS(KN) )THEN + CURR_HICFFT_PLAN=>CURR_HICFFT_PLAN%NEXT_PLAN + ENDIF + ENDDO + IF( .NOT.LLFOUND )THEN + CALL HICFFT_CREATE_PLAN(IPLAN,KTYPE,KN,KLOT,KSTRIDE) + KPLAN=IPLAN + HICT%N_PLANS(KN)=HICT%N_PLANS(KN)+1 + IF( HICT%N_PLANS(KN) /= 1 )THEN + ALLOCATE(CURR_HICFFT_PLAN%NEXT_PLAN) + CURR_HICFFT_PLAN=>CURR_HICFFT_PLAN%NEXT_PLAN + ENDIF + CURR_HICFFT_PLAN%NPLAN=IPLAN + CURR_HICFFT_PLAN%NLOT=KLOT + CURR_HICFFT_PLAN%NSTRIDE=KSTRIDE + CURR_HICFFT_PLAN%NTYPE=KTYPE + CURR_HICFFT_PLAN%NEXT_PLAN=>NULL() + ELSE + KPLAN=IPLAN + ENDIF + !!$OMP END CRITICAL + END SUBROUTINE CREATE_PLAN_FFT + + + SUBROUTINE DESTROY_PLAN_FFT(KPLAN) + TYPE(C_PTR),INTENT(IN) :: KPLAN + INTERFACE + SUBROUTINE HICFFT_DESTROY_PLAN(KPLAN) BIND(C, NAME="hicfft_destroy_plan_") + USE, INTRINSIC :: ISO_C_BINDING + TYPE(C_PTR), VALUE, INTENT(IN) :: KPLAN + END SUBROUTINE HICFFT_DESTROY_PLAN + END INTERFACE + + CALL HICFFT_DESTROY_PLAN(KPLAN) + END SUBROUTINE DESTROY_PLAN_FFT + + + SUBROUTINE DESTROY_ALL_PLANS_FFT + INTEGER(KIND=C_INT) :: JL, JN + TYPE(HICFFT_PLAN),POINTER :: CURR_HICFFT_PLAN + + IF( .NOT. ASSOCIATED(HICT) ) THEN + RETURN + ENDIF + + IF ( .NOT. ASSOCIATED(HICT%HICFFT_PLANS) .OR. .NOT. ASSOCIATED(HICT%N_PLANS) ) THEN + RETURN + ENDIF + + DO JN = 1, HICT%N_MAX + CURR_HICFFT_PLAN=>HICT%HICFFT_PLANS(JN) + DO JL = 1, HICT%N_PLANS(JN) + IF( ASSOCIATED(CURR_HICFFT_PLAN) ) THEN + CALL DESTROY_PLAN_FFT(CURR_HICFFT_PLAN%NPLAN) + CURR_HICFFT_PLAN=>CURR_HICFFT_PLAN%NEXT_PLAN + ENDIF + ENDDO + ENDDO + + DEALLOCATE(HICT%HICFFT_PLANS) + DEALLOCATE(HICT%N_PLANS) + END SUBROUTINE DESTROY_ALL_PLANS_FFT + + SUBROUTINE EXECUTE_PLAN_FFT(KN,N,X_IN,X_OUT,PLAN_PTR) + TYPE(C_PTR) :: PLAN_PTR + INTEGER(KIND=C_INT) :: KN + INTEGER(KIND=C_INT) :: N + REAL(KIND=JPRBT), TARGET :: X_IN + REAL(KIND=JPRBT), TARGET :: X_OUT + + INTERFACE + SUBROUTINE HICFFT_EXECUTE_PLAN (KN, N, X_IN_PTR, X_OUT_PTR, PLAN_PTR) & + & BIND(C,NAME="hicfft_execute_plan_") + USE, INTRINSIC :: ISO_C_BINDING + TYPE(C_PTR), VALUE :: PLAN_PTR + INTEGER(KIND=C_INT), VALUE :: KN + INTEGER(KIND=C_INT), VALUE :: N + TYPE(C_PTR), VALUE :: X_IN_PTR, X_OUT_PTR + END SUBROUTINE HICFFT_EXECUTE_PLAN + END INTERFACE + +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(X_IN,X_OUT) +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(X_IN,X_OUT) +#endif + CALL HICFFT_EXECUTE_PLAN(KN,N,C_LOC(X_IN),C_LOC(X_OUT),PLAN_PTR) +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif + + END SUBROUTINE EXECUTE_PLAN_FFT + +SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,ALLOC) + USE PARKIND_ECTRANS ,ONLY : JPIM + + IMPLICIT NONE + + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + INTERFACE + SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + & BIND(C, NAME="execute_dir_fft_float") + USE ISO_C_BINDING + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(*) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT + TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC + END SUBROUTINE + END INTERFACE + +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) +#endif + CALL EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + +END SUBROUTINE EXECUTE_DIR_FFT_FLOAT +SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,ALLOC) + USE PARKIND_ECTRANS ,ONLY : JPIM + + IMPLICIT NONE + + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + INTERFACE + SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + & BIND(C, NAME="execute_dir_fft_double") + USE ISO_C_BINDING + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(*) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT + TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC + END SUBROUTINE + END INTERFACE + +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) +#endif + CALL EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif + +END SUBROUTINE EXECUTE_DIR_FFT_DOUBLE + +SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,ALLOC) + USE PARKIND_ECTRANS ,ONLY : JPIM + + IMPLICIT NONE + + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + INTERFACE + SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + & BIND(C, NAME="execute_inv_fft_float") + USE ISO_C_BINDING + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(*) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT + TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC + END SUBROUTINE + END INTERFACE + +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) +#endif + CALL EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif +END SUBROUTINE + +SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,ALLOC) + USE PARKIND_ECTRANS ,ONLY : JPIM + + IMPLICIT NONE + + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + INTERFACE + SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + & BIND(C, NAME="execute_inv_fft_double") + USE ISO_C_BINDING + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(*) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT + TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC + END SUBROUTINE + END INTERFACE + +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) +#endif + CALL EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif +END SUBROUTINE + + +END MODULE TPM_HICFFT diff --git a/src/trans/gpu/internal/tpm_pol.F90 b/src/trans/gpu/internal/tpm_pol.F90 new file mode 100755 index 00000000..f563d960 --- /dev/null +++ b/src/trans/gpu/internal/tpm_pol.F90 @@ -0,0 +1,120 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_POL + +! MODIFICATIONS. +! -------------- +! R. El Khatib 17-Feb-2016 Optional allocation/computation of DDC/DDD/DDE +! since they are (big and) not used in supolf. + +USE EC_PARKIND ,ONLY : JPRD, JPIM + +IMPLICIT NONE + +SAVE + +REAL(KIND=JPRD),ALLOCATABLE :: DDC(:,:), DDD(:,:), DDE(:,:) +REAL(KIND=JPRD),ALLOCATABLE :: DDA(:), DDI(:), DDH(:) + +REAL(KIND=JPRD),ALLOCATABLE :: DFA(:), DFB(:), DFF(:), DFG(:), DFI(:), DFH(:) + +CONTAINS +!====================================================================== +SUBROUTINE INI_POL(KNSMAX,LDFAST) + +INTEGER(KIND=JPIM), INTENT(IN) :: KNSMAX +LOGICAL, INTENT(IN), OPTIONAL :: LDFAST + +REAL(KIND=JPRD) :: DC,DD,DE +INTEGER(KIND=JPIM) :: KKN, KKM + +INTEGER(KIND=JPIM) :: JN, JM +LOGICAL :: LLFAST + +DC(KKN,KKM)=SQRT( (REAL(2*KKN+1,JPRD)*REAL(KKN+KKM-1,JPRD)& + &*REAL(KKN+KKM-3,JPRD))& + &/ (REAL(2*KKN-3,JPRD)*REAL(KKN+KKM,JPRD)& + &*REAL(KKN+KKM-2,JPRD)) ) +DD(KKN,KKM)=SQRT( (REAL(2*KKN+1,JPRD)*REAL(KKN+KKM-1,JPRD)& + &*REAL(KKN-KKM+1,JPRD))& + &/ (REAL(2*KKN-1,JPRD)*REAL(KKN+KKM,JPRD)& + &*REAL(KKN+KKM-2,JPRD)) ) +DE(KKN,KKM)=SQRT( (REAL(2*KKN+1,JPRD)*REAL(KKN-KKM,JPRD))& + &/ (REAL(2*KKN-1,JPRD)*REAL(KKN+KKM,JPRD)) ) + +IF (PRESENT(LDFAST)) THEN + LLFAST=LDFAST +ELSE + LLFAST=.FALSE. +ENDIF +IF (.NOT.LLFAST) ALLOCATE( DDC(0:KNSMAX,0:KNSMAX) ) +IF (.NOT.LLFAST) ALLOCATE( DDD(0:KNSMAX,0:KNSMAX) ) +IF (.NOT.LLFAST) ALLOCATE( DDE(0:KNSMAX,0:KNSMAX) ) + +ALLOCATE( DDA(0:KNSMAX) ) +ALLOCATE( DDI(0:KNSMAX) ) +ALLOCATE( DDH(0:KNSMAX) ) + +ALLOCATE( DFA(0:KNSMAX) ) +ALLOCATE( DFB(0:KNSMAX) ) +ALLOCATE( DFF(0:KNSMAX) ) +ALLOCATE( DFG(0:KNSMAX) ) +ALLOCATE( DFI(0:KNSMAX) ) +ALLOCATE( DFH(0:KNSMAX) ) + + +DO JN=1,KNSMAX + DFA(JN) = 1._JPRD/SQRT(REAL(JN*(JN+1),JPRD)) + DFB(JN) = SQRT(REAL(2*JN+1,JPRD)/REAL(JN*(JN+1),JPRD)) + DFF(JN) = REAL(2*JN-1,JPRD)/REAL(JN,JPRD) + DFG(JN) = REAL(JN-1,JPRD)/REAL(JN,JPRD) + DFI(JN) = REAL(JN,JPRD) + DFH(JN) = SQRT(REAL(2*JN+1,JPRD)/REAL(2*JN,JPRD)) +ENDDO + +IF (.NOT.LLFAST) THEN + DO JN=3,KNSMAX + DO JM=2,JN-1 + DDC(JM,JN) = DC(JN,JM) + DDD(JM,JN) = DD(JN,JM) + DDE(JM,JN) = DE(JN,JM) + ENDDO + ENDDO +ENDIF + +DO JN=1,KNSMAX + DDA(JN) = 1._JPRD/SQRT(REAL(JN*(JN+1),JPRD)) + DDI(JN) = REAL(JN,JPRD) + DDH(JN) = SQRT(REAL(2*JN+1,JPRD)/REAL(2*JN,JPRD)) +ENDDO + +END SUBROUTINE INI_POL + +SUBROUTINE END_POL + +IF (ALLOCATED (DDC) ) DEALLOCATE( DDC ) +IF (ALLOCATED (DDD) ) DEALLOCATE( DDD ) +IF (ALLOCATED (DDE) ) DEALLOCATE( DDE ) + +DEALLOCATE( DDA ) +DEALLOCATE( DDI ) +DEALLOCATE( DDH ) + +DEALLOCATE( DFA ) +DEALLOCATE( DFB ) +DEALLOCATE( DFF ) +DEALLOCATE( DFG ) +DEALLOCATE( DFI ) +DEALLOCATE( DFH ) + +END SUBROUTINE END_POL + +END MODULE TPM_POL diff --git a/src/trans/gpu/internal/tpm_stats.F90 b/src/trans/gpu/internal/tpm_stats.F90 new file mode 100644 index 00000000..c72a3bc7 --- /dev/null +++ b/src/trans/gpu/internal/tpm_stats.F90 @@ -0,0 +1,66 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_STATS + +IMPLICIT NONE + +CHARACTER(LEN=32) :: DESCRIPTIONS(100) + +CONTAINS + +SUBROUTINE GSTATS_LABEL_NVTX(KNUM,CTYPE,CDESC) +USE EC_PARKIND ,ONLY : JPIM +IMPLICIT NONE +INTEGER(KIND=JPIM) :: KNUM +CHARACTER(*) CDESC +CHARACTER(*) CTYPE + +IF (KNUM >= 400 .AND. KNUM < 500) THEN + DESCRIPTIONS(KNUM-400+1) = CDESC +ENDIF +CALL GSTATS_LABEL(KNUM,CTYPE,CDESC) +END SUBROUTINE + +SUBROUTINE GSTATS_NVTX(KNUM,KSWITCH) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +#if defined(__NVCOMPILER) + USE NVTX +#endif + + IMPLICIT NONE + + INTEGER(KIND=JPIM),INTENT(IN) :: KNUM + INTEGER(KIND=JPIM),INTENT(IN) :: KSWITCH + INTEGER(KIND=JPIM) :: ICOLOR + +#if defined(__NVCOMPILER) + IF (KNUM >= 400 .AND. KNUM < 500) THEN + IF (KSWITCH == 0) THEN + ICOLOR=0 + IF (KNUM>=430) ICOLOR=10 !LB markers + IF (KNUM==410) ICOLOR=13 !DIR COMPLETE + IF (KNUM==420) ICOLOR=14 !INV COMPLETE + IF (ICOLOR /= 0) THEN + CALL NVTXSTARTRANGE(DESCRIPTIONS(KNUM-400+1),ICOLOR) + ELSE + CALL NVTXSTARTRANGE(DESCRIPTIONS(KNUM-400+1)) + ENDIF + ELSEIF (KSWITCH == 1) THEN + CALL NVTXENDRANGE() + ENDIF + ENDIF + CALL GSTATS(KNUM,KSWITCH) +#endif +END SUBROUTINE GSTATS_NVTX + +END MODULE TPM_STATS + diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 new file mode 100755 index 00000000..363d1b9c --- /dev/null +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -0,0 +1,69 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_TRANS + +! Module to contain variables "local" to a specific call to a transform + +! +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE +USE ISO_C_BINDING, ONLY: C_INT8_T + +IMPLICIT NONE + +SAVE + +!INTEGER_M :: NF_UV ! Number of u-v fields (spectral/fourier space) +!INTEGER_M :: NF_SCALARS ! Number of scalar fields (spectral/fourier space) +!INTEGER_M :: NF_SCDERS ! Number of fields for derivatives of scalars + ! (inverse transform, spectral/fourier space) +!INTEGER_M :: NF_OUT_LT ! Number of fields that comes out of Inverse + ! Legendre transform +INTEGER(KIND=JPIM) :: NF_SC2 ! Number of fields in "SPSC2" arrays. +INTEGER(KIND=JPIM) :: NF_SC3A ! Number of fields in "SPSC3A" arrays. +INTEGER(KIND=JPIM) :: NF_SC3B ! Number of fields in "SPSC3B" arrays. + +!LOGICAL :: LUV ! uv fields requested +!LOGICAL :: LSCALAR ! scalar fields requested +LOGICAL :: LVORGP ! vorticity requested +LOGICAL :: LDIVGP ! divergence requested +LOGICAL :: LUVDER ! E-W derivatives of U and V requested +LOGICAL :: LSCDERS ! derivatives of scalar variables are req. +LOGICAL :: LATLON ! lat-lon output requested + +!INTEGER_M :: NLEI2 ! 8*NF_UV + 2*NF_SCALARS + 2*NF_SCDERS (dimension in + ! inverse Legendre transform) +!INTEGER_M :: NLED2 ! 2*NF_FS (dimension in direct Legendre transform) + +!INTEGER_M :: NF_FS ! Total number of fields in Fourier space + +!INTEGER_M :: NF_GP ! Total number of field in grid-point space +!INTEGER_M :: NF_UV_G ! Global version of NF_UV (grid-point space) +!INTEGER_M :: NF_SCALARS_G ! Global version of NF_SCALARS (grid-point space) + +REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) ! Fourier buffer +REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) ! Fourier buffer + +INTEGER(KIND=JPIM) :: NPROMA ! Blocking factor for gridpoint input/output +INTEGER(KIND=JPIM) :: NGPBLKS ! Number of NPROMA blocks + +LOGICAL :: LGPNORM = .FALSE. ! indicates whether transform is being done for gpnorm + +! This is used in fourier space and in spectral space. It's reused among +! the transforms because we cannot reallocate - the captured graphs +! should not be modified. Hence, we keep it if it is large enough, otherwise +! we adapt the size. After 2 iterations this should lead to constant runtimes +! (the first iteration is used to get the max buffer size, the second iteration +! is going to recreate the graphs if needed) +TYPE(GROWING_ALLOCATION_TYPE) :: GROWING_ALLOCATION + +END MODULE TPM_TRANS diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 new file mode 100755 index 00000000..02cc5be3 --- /dev/null +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -0,0 +1,751 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TRGTOL_MOD + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRGTOL_HANDLE, TRGTOL, PREPARE_TRGTOL + + TYPE TRGTOL_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL + END TYPE +CONTAINS + FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) + USE PARKIND_ECTRANS, ONLY : JPIM, JPRB, JPRBT + USE TPM_DISTR, ONLY : D + USE BUFFERED_ALLOCATOR_MOD + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP, KF_FS + TYPE(TRGTOL_HANDLE) :: HTRGTOL + + REAL(KIND=JPRBT) :: DUMMY + + INTEGER(KIND=C_SIZE_T) :: NELEM + + HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, int(KF_GP*D%NGPTOT*SIZEOF(DUMMY),kind=c_size_t)) + + NELEM = KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! ZCOMBUFR + NELEM = NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! PREEL_REAL + HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM) + END FUNCTION PREPARE_TRGTOL + + SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& + &PGP,PGPUV,PGP3A,PGP3B,PGP2,KPTRGP,KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2) + + !**** *TRGTOL * - transposition of grid point data from column + ! structure to latitudinal. Reorganize data between + ! grid point calculations and direct Fourier Transform + + ! Version using CUDA-aware MPI + + ! Purpose. + ! -------- + + + !** Interface. + ! ---------- + ! *call* *trgtol(...) + + ! Explicit arguments : + ! -------------------- + ! PREEL_REAL - Latitudinal data ready for direct FFT (output) + ! PGP - Blocked grid point data (input) + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + ! See documentation + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! MPP Group *ECMWF* + + ! Modifications. + ! -------------- + ! Original: 95-10-01 + ! D.Dent : 97-08-04 Reorganisation to allow + ! NPRTRV to differ from NPRGPEW + ! : 98-06-17 add mailbox control logic (from TRLTOM) + ! =99-03-29= Mats Hamrud and Deborah Salmond + ! JUMP in FFT's changed to 1 + ! KINDEX introduced and ZCOMBUF not used for same PE + ! 01-11-23 Deborah Salmond and John Hague + ! LIMP_NOOLAP Option for non-overlapping message passing + ! and buffer packing + ! 01-12-18 Peter Towers + ! Improved vector performance of GTOL_PACK,GTOL_UNPACK + ! 03-04-02 G. Radnoti: call barrier always when nproc>1 + ! 08-01-01 G.Mozdzynski: cleanup + ! 09-01-02 G.Mozdzynski: use non-blocking recv and send + ! ------------------------------------------------------------------ + + + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT, jprd + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS + USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV + USE PE2SET_MOD ,ONLY : PE2SET + USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML + USE OML_MOD ,ONLY : OML_MY_THREAD + USE MPI_F08 + USE TPM_STATS ,ONLY : GSTATS => GSTATS_NVTX + USE TPM_TRANS ,ONLY : NPROMA + USE ISO_C_BINDING ,ONLY : C_SIZE_T, c_float, c_double, c_int8_t + USE BUFFERED_ALLOCATOR_MOD + USE OPENACC_EXT + + IMPLICIT NONE + + REAL(KIND=JPRBT),INTENT(OUT), POINTER :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:), KVSETSC(:), KVSETSC3A(:), KVSETSC3B(:), KVSETSC2(:) + REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP(:,:,:), PGPUV(:,:,:,:), PGP3A(:,:,:,:), PGP3B(:,:,:,:), PGP2(:,:,:) + + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRGTOL_HANDLE), INTENT(IN) :: HTRGTOL + + ! LOCAL VARIABLES + + ! LOCAL INTEGER SCALARS + REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) + + INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) + INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIM) :: IREQ (NPROC*2) + INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) + INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) + + INTEGER(KIND=JPIM) :: IFIRSTLAT, IGL, IGLL, ILAST,& + &ILASTLAT, ILEN, JROC, IPOS, ISETA, & + &ISETB, IRECV, & + &ISETV, ISEND, JBLK, JFLD, & + &JGL, JI, JK, JL, ISETW, IFLD, & + &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, & + &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT + INTEGER(KIND=JPIM) :: KF, KGL, KI, J3 + + INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP + INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V + INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V + INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V + INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V + INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + INTEGER(KIND=JPIM) :: IFLDA(KF_GP) + INTEGER(KIND=JPIM) :: IVSET(KF_GP) + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + + INTEGER(JPIM), PARAMETER :: PGP_INDICES_UV = 1 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP2 = 2 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3A = 3 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3B = 4 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_END = 5 + INTEGER(JPIM) :: PGP_INDICES(PGP_INDICES_END) + + TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins... + INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT = 0 + + TYPE(MPI_COMM) :: LOCAL_COMM + TYPE(MPI_REQUEST) :: IREQUEST(2*NPROC) + +#ifdef PARKINDTRANS_SINGLE +#define TRGTOL_DTYPE MPI_FLOAT +#else +#define TRGTOL_DTYPE MPI_DOUBLE +#endif + LOCAL_COMM%MPI_VAL = MPL_COMM_OML( OML_MY_THREAD() ) + + ! ------------------------------------------------------------------ + + !* 0. Some initializations + ! -------------------- + ! Note we have either + ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or + ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2) + ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which + ! should match PSPXXX and PGPXXX arrays) + IOFF=0 + IF(PRESENT(KVSETUV)) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) + IOFF=IOFF+KF_UV_G + ELSE + IVSET(IOFF+1:IOFF+KF_UV_G) = -1 + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = -1 + IOFF=IOFF+KF_UV_G + ENDIF + IF(PRESENT(KVSETSC)) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = KVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ELSE + IF(PRESENT(KVSETSC2)) THEN + IVSET(IOFF+1:IOFF+SIZE(KVSETSC2)) = KVSETSC2(:) + IOFF=IOFF+SIZE(KVSETSC2) + ENDIF + IF(PRESENT(KVSETSC3A)) THEN + DO J3=1,SIZE(PGP3A,3) + IVSET(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) + IOFF=IOFF+SIZE(KVSETSC3A) + ENDDO + ENDIF + IF(PRESENT(KVSETSC3B)) THEN + DO J3=1,SIZE(PGP3B,3) + IVSET(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) + IOFF=IOFF+SIZE(KVSETSC3B) + ENDDO + ENDIF + ENDIF + + IF (IOFF /= 2*KF_UV_G+KF_SCALARS_G) THEN + PRINT*, "TRGTOL: ERROR IN IVSET COMPUTATION" + FLUSH(6) + STOP 38 + ENDIF + + IF (LHOOK) CALL DR_HOOK('TRGTOL',0,ZHOOK_HANDLE) + + CALL GSTATS(1805,0) + IOFF=1 + PGP_INDICES(PGP_INDICES_UV) = IOFF + IF (PRESENT(PGPUV)) IOFF=IOFF+UBOUND(PGPUV,2)*2 + PGP_INDICES(PGP_INDICES_GP2) = IOFF + IF (PRESENT(PGP2)) IOFF=IOFF+UBOUND(PGP2,2) + PGP_INDICES(PGP_INDICES_GP3A) = IOFF + IF (PRESENT(PGP3A)) IOFF=IOFF+UBOUND(PGP3A,2)*UBOUND(PGP3A,3) + PGP_INDICES(PGP_INDICES_GP3B) = IOFF + IF (PRESENT(PGP3B)) IOFF=IOFF+UBOUND(PGP3B,2)*UBOUND(PGP3B,3) + PGP_INDICES(PGP_INDICES_END) = IOFF + + ! Prepare sender arrays + ! find number of fields on a certain V-set + IF(NPRTRV == 1) THEN + ! This is needed because IVSET(JFLD) == -1 if there is only one V-set + ISEND_FIELD_COUNT(1) = KF_GP + ELSE + ISEND_FIELD_COUNT(:) = 0 + DO JFLD=1,KF_GP + ISEND_FIELD_COUNT(IVSET(JFLD)) = ISEND_FIELD_COUNT(IVSET(JFLD)) + 1 + ENDDO + ENDIF + ! find number of grid-points on a certain W-set that overlap with myself + ISEND_WSET_SIZE(:) = 0 + DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) + ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 + ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = & + & ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) + ENDDO + ! sum up offsets + ISEND_WSET_OFFSET(1) = 0 + DO JROC=1,NPRTRW + ISEND_WSET_OFFSET(JROC+1)=ISEND_WSET_OFFSET(JROC)+ISEND_WSET_SIZE(JROC) + ENDDO + DO JROC=1,NPROC + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + ! total send size is # points per field * # fields + ISENDTOT(JROC) = ISEND_WSET_SIZE(ISETW)*ISEND_FIELD_COUNT(ISETV) + ENDDO + + ! Prepare receiver arrays + IRECV_BUFR_TO_OUT_OFFSET(:) = 0 + DO JROC=1,NPROC + ! Get new offset to my current KINDEX entry + IF (JROC > 1 .AND. KF_FS > 0) THEN + IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1)+IRECVTOT(JROC-1)/KF_FS + ELSEIF (JROC > 1) THEN + IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1) + ENDIF + + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + + ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) + ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver + ! (me, the W-set). Ideally those conincide, at least mostly. + IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) + ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set) + ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) + + IPOS = 0 + DO JGL=IFIRSTLAT,ILASTLAT + ! get from "actual" latitude to the latitude strip offset + IGL = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA) + ! get from "actual" latitude to the latitude offset + IGLL = JGL-D%NPTRLS(MYSETW)+1 + DO JL=1,D%NONL(IGL,ISETB) + IPOS = IPOS+1 + ! offset to first layer of this gridpoint + IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,1) = & + & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) + ! distance between two layers of this gridpoint + IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,2) = & + & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) + ENDDO + ENDDO + !we always receive the full fourier space + IRECVTOT(JROC) = IPOS*KF_FS + ENDDO + + block + CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& + & int(KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))+1,kind=c_size_t), int(KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1)),kind=c_size_t)) + !!CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL), size1, size2) + end block + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) +#endif + + CALL GSTATS(1805,1) + + ! Put data on device for copyin + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(412,0) + ACC_POINTERS_CNT = 0 + IF (PRESENT(PGP)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP) + ENDIF + IF (PRESENT(PGPUV)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGPUV) + ENDIF + IF (PRESENT(PGP2)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP2) + ENDIF + IF (PRESENT(PGP3A)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3A) + ENDIF + IF (PRESENT(PGP3B)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) + ENDIF + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + !$ACC WAIT(1) + IF (PRESENT(PGP)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE DEVICE(PGP) +#endif + ENDIF + IF (PRESENT(PGPUV)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE DEVICE(PGPUV) +#endif + ENDIF + IF (PRESENT(PGP2)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE DEVICE(PGP2) +#endif + ENDIF + IF (PRESENT(PGP3A)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE DEVICE(PGP3A) +#endif + ENDIF + IF (PRESENT(PGP3B)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE DEVICE(PGP3B) +#endif + ENDIF +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA IF(PRESENT(PGP) .AND. KF_GP > 0) PRESENT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) +#endif + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(432,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(432,1) + ENDIF + CALL GSTATS(412,1) + + ! Figure out processes that send or recv something + ISEND_COUNTS = 0 + IRECV_COUNTS = 0 + DO JROC=1,NPROC + IF( JROC /= MYPROC) THEN + IF(IRECVTOT(JROC) > 0) THEN + ! I have to recv something, so let me store that + IRECV_COUNTS = IRECV_COUNTS + 1 + IRECV_TO_PROC(IRECV_COUNTS)=JROC + ENDIF + IF(ISENDTOT(JROC) > 0) THEN + ! I have to send something, so let me store that + ISEND_COUNTS = ISEND_COUNTS+1 + ISEND_TO_PROC(ISEND_COUNTS)=JROC + ENDIF + ENDIF + ENDDO + + ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1)) + ICOMBUFS_OFFSET(1) = 0 + DO JROC=1,ISEND_COUNTS + ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC)) + ENDDO + ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1)) + ICOMBUFR_OFFSET(1) = 0 + DO JROC=1,IRECV_COUNTS + ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) + ENDDO + + IF (ISEND_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& + & 1_C_SIZE_T, int(ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1)),kind=c_size_t)) + ENDIF + + !....Pack loop......................................................... +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA IF(ISEND_COUNTS > 0) PRESENT(ZCOMBUFS) ASYNC(1) +#endif + + CALL GSTATS(1602,0) + DO INS=1,ISEND_COUNTS + ISEND=ISEND_TO_PROC(INS) + CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) + + ISEND_FIELD_COUNT_V = ISEND_FIELD_COUNT(ISETV) + ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) + + IFLDS = 0 + DO JFLD=1,KF_GP + IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS)=KPTRGP(JFLD) + ELSE + IFLDA(IFLDS)=JFLD + ENDIF + ENDIF + ENDDO + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA COPYIN(IFLDA(1:ISEND_FIELD_COUNT_V)) ASYNC(1) +#endif + + ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(ISETW) + ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(ISETW) + IF(PRESENT(PGP)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) & + !$ACC& FIRSTPRIVATE(ISEND_FIELD_COUNT_V,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,& + !$ACC& ICOMBUFS_OFFSET_V,NPROMA) ASYNC(1) +#endif + DO JFLD=1,ISEND_FIELD_COUNT_V + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL + ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP(JK,IFLD,JBLK) + ENDDO + ENDDO + ELSE +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI,IOFF,PBOUND) & + !$ACC& FIRSTPRIVATE(ISEND_FIELD_COUNT_V,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,& + !$ACC& ICOMBUFS_OFFSET_V,NPROMA) ASYNC(1) +#endif + DO JFLD=1,ISEND_FIELD_COUNT_V + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + JI = ICOMBUFS_OFFSET_V+(JFLD-1)*ISEND_WSET_SIZE_V+JL + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + ! TODO we could certainly reshape PGPXX arrays and we would simplify this + ZCOMBUFS(JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + ZCOMBUFS(JI) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + ZCOMBUFS(JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + ZCOMBUFS(JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ENDIF + ENDDO + ENDDO + ENDIF +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA +#endif + ENDDO +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(1602,1) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + + CALL GSTATS(411,0) + IF (IRECV_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& + & 1_C_SIZE_T, int(ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1)),kind=c_size_t)) + ENDIF +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA IF(IRECV_COUNTS > 0) PRESENT(ZCOMBUFR) +#endif + + IR=0 + +#ifdef USE_GPU_AWARE_MPI +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS) +#endif +#else + !! this is safe-but-slow fallback for running without GPU-aware MPI + !$ACC UPDATE HOST(ZCOMBUFS) +#endif + ! Receive loop......................................................... + DO INR=1,IRECV_COUNTS + IR=IR+1 + IPROC=IRECV_TO_PROC(INR) + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IPROC), & + & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) + IREQ(IR) = IREQUEST(IR)%MPI_VAL + ENDDO + + !....Send loop......................................................... + DO INS=1,ISEND_COUNTS + IR=IR+1 + ISEND=ISEND_TO_PROC(INS) + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) + IREQ(IR) = IREQUEST(IR)%MPI_VAL + ENDDO + + ! Copy local contribution + IF(ISENDTOT(MYPROC) > 0 )THEN + ! I have to send something to myself... + + ! Input is KF_GP fields. We find the resulting KF_FS fields. + IFLDS = 0 + DO JFLD=1,KF_GP + IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS) = KPTRGP(JFLD) + ELSE + IFLDA(IFLDS) = JFLD + ENDIF + ENDIF + ENDDO + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) +#endif + + ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(MYSETW) + ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(MYSETW) + IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(MYPROC) + CALL GSTATS(1601,0) + IF(PRESENT(PGP)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) & + !$ACC& FIRSTPRIVATE(KF_FS,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,& + !$ACC& IRECV_BUFR_TO_OUT_V,NPROMA) ASYNC(1) +#endif + DO JFLD=1,KF_FS + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK) + ENDDO + ENDDO + ELSE +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF,PBOUND) & + !$ACC& FIRSTPRIVATE(KF_FS,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V, & + !$ACC& IRECV_BUFR_TO_OUT_V,NPROMA) ASYNC(1) +#endif + DO JFLD=1,KF_FS + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + PREEL_REAL(IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + PREEL_REAL(IPOS) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + PREEL_REAL(IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + PREEL_REAL(IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ENDIF + ENDDO + ENDDO + ENDIF + CALL GSTATS(1601,1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA +#endif + + ENDIF + + + IF(IR > 0) THEN + CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & + & CDSTRING='TRGTOL: WAIT FOR SENDS AND RECEIVES') + ENDIF +#ifdef USE_GPU_AWARE_MPI +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif +#else + !! this is safe-but-slow fallback for running without GPU-aware MPI + !$ACC UPDATE DEVICE(ZCOMBUFR) +#endif + IF (LSYNC_TRANS) THEN + CALL GSTATS(431,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(431,1) + ENDIF + CALL GSTATS(411,1) + + ! Unpack loop......................................................... + + CALL GSTATS(1603,0) + DO INR=1,IRECV_COUNTS + IPROC=IRECV_TO_PROC(INR) + ILEN = IRECVTOT(IPROC)/KF_FS + IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(IPROC) + ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II,IPOS) FIRSTPRIVATE(KF_FS,ILEN, & + !$ACC& IRECV_BUFR_TO_OUT_V,ICOMBUFR_OFFSET_V) ASYNC(1) +#endif + DO JFLD=1,KF_FS + DO JL=1,ILEN + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + PREEL_REAL(IPOS) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) + ENDDO + ENDDO + ENDDO +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(1603,1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA ! ZCOMBUFR + !$ACC END DATA ! IRECV_BUFR_TO_OUT,PGPINDICES + !$ACC END DATA !ZCOMBUFS (present) + !$ACC END DATA !PGP3B + !$ACC END DATA !PGP3A + !$ACC END DATA !PGP2 + !$ACC END DATA !PGPUV + !$ACC END DATA !PGP +#endif + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + + IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) + END SUBROUTINE TRGTOL +END MODULE TRGTOL_MOD diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 new file mode 100755 index 00000000..22efe3c3 --- /dev/null +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -0,0 +1,916 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TRLTOG_MOD + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRLTOG, TRLTOG_HANDLE, PREPARE_TRLTOG + + TYPE TRLTOG_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_AND_COMBUFS + END TYPE +CONTAINS + FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP, KF_FS + TYPE(TRLTOG_HANDLE) :: HTRLTOG + + REAL(KIND=JPRBT) :: DUMMY + + INTEGER(KIND=C_SIZE_T) :: NELEM + + NELEM = ALIGN(KF_GP*D%NGPTOT*SIZEOF(DUMMY),128) ! ZCOMBUFR + NELEM = ALIGN(NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY),128) !ZCOMBUFS upper obund + + HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM) + END FUNCTION PREPARE_TRLTOG + + SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,& + & KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2,& + & PGP,PGPUV,PGP3A,PGP3B,PGP2) + + !**** *trltog * - transposition of grid point data from latitudinal + ! to column structure. This takes place between inverse + ! FFT and grid point calculations. + ! TRLTOG is the inverse of TRGTOL + + ! Version using CUDA-aware MPI + + ! Purpose. + ! -------- + + + !** Interface. + ! ---------- + ! *call* *trltog(...) + + ! Explicit arguments : + ! -------------------- + ! PREEL_REAL - Latitudinal data ready for direct FFT (input) + ! PGP - Blocked grid point data (output) + ! KVSET - "v-set" for each field (input) + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + ! See documentation + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! MPP Group *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 95-10-01 + ! D.Dent : 97-08-04 Reorganisation to allow NPRTRV + ! to differ from NPRGPEW + ! =99-03-29= Mats Hamrud and Deborah Salmond + ! JUMP in FFT's changed to 1 + ! INDEX introduced and ZCOMBUF not used for same PE + ! 01-11-23 Deborah Salmond and John Hague + ! LIMP_NOOLAP Option for non-overlapping message passing + ! and buffer packing + ! 01-12-18 Peter Towers + ! Improved vector performance of LTOG_PACK,LTOG_UNPACK + ! 03-0-02 G. Radnoti: Call barrier always when nproc>1 + ! 08-01-01 G.Mozdzynski: cleanup + ! 09-01-02 G.Mozdzynski: use non-blocking recv and send + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS + USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV + USE PE2SET_MOD ,ONLY : PE2SET + USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML + USE OML_MOD ,ONLY : OML_MY_THREAD + USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + USE MPI_F08 + USE TPM_STATS ,ONLY : GSTATS => GSTATS_NVTX + + USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA + USE ISO_C_BINDING ,ONLY : C_SIZE_T + USE OPENACC_EXT + + IMPLICIT NONE + +#ifdef OMPGPU + include 'mpif.h' +#endif + + + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP + INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G, KF_SCALARS_G + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP(:,:,:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGPUV(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3A(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3B(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP2(:,:,:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) + + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOG_HANDLE) :: HTRLTOG + + ! LOCAL VARIABLES + + REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) + + INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) + INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIM) :: IREQ (NPROC*2) + INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) + INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) + + INTEGER(KIND=JPIM) :: JFLD, J, JI, J1, J2, JGL, JK, JL, IFLDS, JROC, INR, INS + INTEGER(KIND=JPIM) :: IFIRSTLAT, ILASTLAT, IFLD, IGL, IGLL,& + &IPOS, ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, & + &IR, ILOCAL_LAT, ISEND_COUNTS, IRECV_COUNTS, IERROR, II, ILEN, IBUFLENS, IBUFLENR, & + &JBLK, ILAT_STRIP + + ! Contains FIELD, PARS, LEVS + INTEGER(KIND=JPIM) :: IGP_OFFSETS(KF_GP,3) + INTEGER(KIND=JPIM), PARAMETER :: IGP_OFFSETS_UV=1, IGP_OFFSETS_GP2=2, IGP_OFFSETS_GP3A=3, IGP_OFFSETS_GP3B=4 + INTEGER(KIND=JPIM) :: IUVPAR,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF,IOFF + + INTEGER(KIND=JPIM) :: IFLDA(KF_GP) + INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF,2),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V + INTEGER(KIND=JPIM) :: IRECV_FIELD_COUNT(NPRTRV),IRECV_FIELD_COUNT_V + INTEGER(KIND=JPIM) :: IRECV_WSET_SIZE(NPRTRW),IRECV_WSET_SIZE_V + INTEGER(KIND=JPIM) :: IRECV_WSET_OFFSET(NPRTRW+1), IRECV_WSET_OFFSET_V + INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + + INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) + INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) + INTEGER(KIND=JPIM) :: IVSET(KF_GP) + INTEGER(KIND=JPIM) :: J3,IFGP2,IFGP3A,IFGP3B + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR + + TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins... + INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT = 0 + + TYPE(MPI_COMM) :: LOCAL_COMM + TYPE(MPI_REQUEST) :: IREQUEST(NPROC*2) + +#ifdef PARKINDTRANS_SINGLE +#define TRLTOG_DTYPE MPI_FLOAT +#else +#define TRLTOG_DTYPE MPI_DOUBLE +#endif + LOCAL_COMM%MPI_VAL = MPL_COMM_OML( OML_MY_THREAD() ) + + ! ------------------------------------------------------------------ + + !* 0. Some initializations + ! -------------------- + IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) + + ! Note we have either + ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or + ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2) + ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which + ! should match PSPXXX and PGPXXX arrays) + + + ! We first get the decomposition individually + IVSETUV(:) = -1 + IF (PRESENT(KVSETUV)) IVSETUV(:) = KVSETUV(:) + IVSETSC(:)=-1 + IF (PRESENT(KVSETSC)) THEN + IVSETSC(:) = KVSETSC(:) + ELSE + IOFF=0 + IF (PRESENT(KVSETSC2)) THEN + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC2))=KVSETSC2(:) + IOFF = IOFF+SIZE(KVSETSC2) + ENDIF + IF (PRESENT(KVSETSC3A)) THEN + DO J3=1,MERGE(UBOUND(PGP3A,3),UBOUND(PGP3A,3)/3,.NOT. LSCDERS) + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) + IOFF=IOFF+SIZE(KVSETSC3A) + ENDDO + ENDIF + IF (PRESENT(KVSETSC3B)) THEN + ! If SCDERS is on, the size of PGP is 3X larger because it is + ! holding various derivatives. The problem is that those are + ! at different non-contiguous positions, hence we treat them + ! as separate fields + DO J3=1,MERGE(UBOUND(PGP3B,3),UBOUND(PGP3B,3)/3,.NOT. LSCDERS) + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) + IOFF=IOFF+SIZE(KVSETSC3B) + ENDDO + ENDIF + IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN + PRINT*, "TRLTOG: ERROR IN IVSETSC COMPUTATION" + STOP 39 + ENDIF + ENDIF + + ! Now from UV and Scalars decomposition we get the full decomposition + IOFF=0 + IF (KF_UV_G > 0) THEN + IF (LVORGP) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IF ( LDIVGP) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IF (KF_SCALARS_G > 0) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + IF (LSCDERS) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ENDIF + ENDIF + IF (KF_UV_G > 0 .AND. LUVDER) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IF (KF_SCALARS_G > 0) THEN + IF (LSCDERS) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ENDIF + ENDIF + + IF (.NOT. PRESENT(PGP)) THEN + ! This is only relevant if we use the split interface (i.e. not PGP) + + IGP2PAR = 0 + IGP3APAR = 0 + IGP3ALEV = 0 + IGP3BPAR = 0 + IGP3BLEV = 0 + IF (PRESENT(PGP2)) THEN + IGP2PAR = UBOUND(PGP2,2) + IF(LSCDERS) IGP2PAR = IGP2PAR/3 + ENDIF + IF (PRESENT(PGP3A)) THEN + IGP3ALEV = UBOUND(PGP3A,2) + IGP3APAR = UBOUND(PGP3A,3) + IF(LSCDERS) IGP3APAR = IGP3APAR/3 + ENDIF + IF (PRESENT(PGP3B)) THEN + IGP3BLEV = UBOUND(PGP3B,2) + IGP3BPAR = UBOUND(PGP3B,3) + IF(LSCDERS) IGP3BPAR = IGP3BPAR/3 + ENDIF + IF (IGP2PAR + IGP3ALEV*IGP3APAR + IGP3BPAR*IGP3BLEV /= KF_SCALARS_G) THEN + PRINT *, IGP2PAR, IGP3APAR, IGP3ALEV, IGP3BPAR, IGP3BLEV + CALL ABORT_TRANS("INCONSISTENCY IN SCALARS") + ENDIF + + ! This is only relevant if we use the split interface (i.e. not PGP) + IUVPAR = 1 + IOFF=1 + IF(LVORGP) THEN + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + ENDIF + + IF(LDIVGP) THEN + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + ENDIF + + ! U + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + + ! V + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + + ! Scalars + ! PGP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J, J=1,IGP2PAR)/) + IOFF=IOFF+IGP2PAR + ! PGP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) + IOFF=IOFF+IGP3APAR*IGP3ALEV + ! PGP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) + IOFF=IOFF+IGP3BPAR*IGP3BLEV + + IF(LSCDERS) THEN + !Scalars NS Derivatives + ! PGP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+IGP2PAR, J=1,IGP2PAR)/) + IOFF=IOFF+IGP2PAR + ! PGP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) + IOFF=IOFF+IGP3APAR*IGP3ALEV + ! PGP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) + IOFF=IOFF+IGP3BPAR*IGP3BLEV + ENDIF + + IF(LUVDER) THEN + ! U Derivative NS + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + + ! V Derivative NS + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + ENDIF + + IF(LSCDERS) THEN + !Scalars NS Derivatives + ! PGP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+2*IGP2PAR, J=1,IGP2PAR)/) + IOFF=IOFF+IGP2PAR + ! PGP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+2*IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) + IOFF=IOFF+IGP3APAR*IGP3ALEV + ! PGP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+2*IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) + IOFF=IOFF+IGP3BPAR*IGP3BLEV + ENDIF + ENDIF + + CALL GSTATS(1806,0) + + ! Prepare receiver arrays + ! find number of fields on a certain V-set + IF(NPRTRV == 1) THEN + ! This is needed because KVSET(JFLD) == -1 if there is only one V-set + IRECV_FIELD_COUNT(1) = KF_GP + ELSE + IRECV_FIELD_COUNT(:) = 0 + DO JFLD=1,KF_GP + IRECV_FIELD_COUNT(IVSET(JFLD)) = IRECV_FIELD_COUNT(IVSET(JFLD)) + 1 + ENDDO + ENDIF + ! find number of grid-points on a certain W-set that overlap with myself + IRECV_WSET_SIZE(:) = 0 + DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) + ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 + IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = & + & IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) + ENDDO + ! sum up offsets + IRECV_WSET_OFFSET(1) = 0 + DO JROC=1,NPRTRW + IRECV_WSET_OFFSET(JROC+1)=IRECV_WSET_OFFSET(JROC)+IRECV_WSET_SIZE(JROC) + ENDDO + DO JROC=1,NPROC + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + ! total recv size is # points per field * # fields + IRECVTOT(JROC) = IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV) + ENDDO + + ! Prepare sender arrays + IIN_TO_SEND_BUFR_OFFSET(1) = 0 + DO JROC=1,NPROC + ! Get new offset to my current KINDEX entry + IF (JROC > 1 .AND. KF_FS > 0) THEN + IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1)+ISENDTOT(JROC-1)/KF_FS + ELSEIF (JROC > 1) THEN + IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1) + ENDIF + + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + + ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) + ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver + ! (me, the W-set). Ideally those conincide, at least mostly. + IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) + ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set) + ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) + + IPOS = 0 + DO JGL=IFIRSTLAT,ILASTLAT + ! get from "actual" latitude to the latitude strip offset + IGL = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA) + ! get from "actual" latitude to the latitude offset + IGLL = JGL-D%NPTRLS(MYSETW)+1 + DO JL=1,D%NONL(IGL,ISETB) + IPOS = IPOS+1 + ! offset to first layer of this gridpoint + IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,1) = & + & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) + ! distance between two layers of this gridpoint + IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,2) = & + & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) + ENDDO + ENDDO + !we always receive the full fourier space + ISENDTOT(JROC) = IPOS*KF_FS + ENDDO + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) +#endif + + ACC_POINTERS_CNT = 0 + IF (PRESENT(PGP)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP) + ENDIF + IF (PRESENT(PGPUV)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGPUV) + ENDIF + IF (PRESENT(PGP2)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP2) + ENDIF + IF (PRESENT(PGP3A)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3A) + ENDIF + IF (PRESENT(PGP3B)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) + ENDIF + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) + + ! Present until self contribution and packing are done + !$ACC DATA PRESENT(PREEL_REAL) +#endif +#ifdef OMPGPU +#endif + + CALL GSTATS(1806,1) + + ! Copy local contribution + IF(ISENDTOT(MYPROC) > 0) THEN + ! I have to send something to myself... + + ! Input is KF_GP fields. We find the resulting KF_FS fields. + IFLDS = 0 + DO JFLD=1,KF_GP + IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS) = KPTRGP(JFLD) + ELSE + IFLDA(IFLDS) = JFLD + ENDIF + ENDIF + ENDDO + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) +#endif + + CALL GSTATS(1604,0) + + IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(MYSETW) + IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(MYSETW) + IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC) + IF (PRESENT(PGP)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) & + !$ACC& FIRSTPRIVATE(KF_FS,IRECV_WSET_SIZE_V,IRECV_WSET_OFFSET_V, & + !$ACC& IIN_TO_SEND_BUFR_V,NPROMA) ASYNC(1) +#endif + DO JFLD=1,KF_FS + DO JL=1,IRECV_WSET_SIZE_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & + & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 + PGP(JK,IFLD,JBLK) = PREEL_REAL(IPOS) + ENDDO + ENDDO + ELSE +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) & + !$ACC& FIRSTPRIVATE(KF_FS,IRECV_WSET_SIZE_V,IRECV_WSET_OFFSET_V, & + !$ACC& IIN_TO_SEND_BUFR_V,NPROMA) ASYNC(1) +#endif + DO JFLD=1,KF_FS + DO JL=1,IRECV_WSET_SIZE_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & + & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 + IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN + PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN + PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK)=PREEL_REAL(IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN + PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN + PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + ENDIF + ENDDO + ENDDO + ENDIF + CALL GSTATS(1604,1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA +#endif + + ENDIF + + ! Figure out processes that send or recv something + ISEND_COUNTS = 0 + IRECV_COUNTS = 0 + DO JROC=1,NPROC + IF( JROC /= MYPROC) THEN + IF(IRECVTOT(JROC) > 0) THEN + ! I have to recv something, so let me store that + IRECV_COUNTS = IRECV_COUNTS + 1 + IRECV_TO_PROC(IRECV_COUNTS)=JROC + ENDIF + IF(ISENDTOT(JROC) > 0) THEN + ! I have to send something, so let me store that + ISEND_COUNTS = ISEND_COUNTS+1 + ISEND_TO_PROC(ISEND_COUNTS)=JROC + ENDIF + ENDIF + ENDDO + + ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1)) + ICOMBUFS_OFFSET(1) = 0 + DO JROC=1,ISEND_COUNTS + ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC)) + ENDDO + ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1)) + ICOMBUFR_OFFSET(1) = 0 + DO JROC=1,IRECV_COUNTS + ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) + ENDDO + + IF (IRECV_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& + & 1_C_SIZE_T, int(ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1)),kind=c_size_t)) + ENDIF + IF (ISEND_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& + & int(ALIGN(KF_GP*D%NGPTOT*SIZEOF(ZCOMBUFR(1)),128)+1,kind=c_size_t), & + & int(ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1)),kind=c_size_t)) + ENDIF + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(ZCOMBUFS) +#endif + CALL GSTATS(1605,0) + DO INS=1,ISEND_COUNTS + IPROC = ISEND_TO_PROC(INS) + ILEN = ISENDTOT(IPROC)/KF_FS + IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(IPROC) + ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS) FIRSTPRIVATE(KF_FS,ILEN,IIN_TO_SEND_BUFR_V, & + !$ACC& ICOMBUFS_OFFSET_V) COLLAPSE(2) ASYNC(1) +#endif + DO JFLD=1,KF_FS + DO JL=1,ILEN + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & + & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 + ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PREEL_REAL(IPOS) + ENDDO + ENDDO + ENDDO + CALL GSTATS(1605,1) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA ! ZCOMBUFS + + !$ACC END DATA ! PREEL_REAL + + !$ACC WAIT(1) +#endif + + CALL GSTATS(805,0) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(421,0) + + IR=0 + !...Receive loop......................................................... +#ifdef USE_GPU_AWARE_MPI +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR) +#endif +#else + !! this is safe-but-slow fallback for running without GPU-aware MPI + !$ACC UPDATE HOST(ZCOMBUFS) +#endif + DO INR=1,IRECV_COUNTS + IR=IR+1 + IRECV=IRECV_TO_PROC(INR) + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & + & IRECVTOT(IRECV), & + & TRLTOG_DTYPE,NPRCIDS(IRECV)-1, & + & MTAGLG, LOCAL_COMM, IREQUEST(IR), & + & IERROR ) + IREQ(IR) = IREQUEST(IR)%MPI_VAL + ENDDO + + !...Send loop......................................................... + DO INS=1,ISEND_COUNTS + IR=IR+1 + ISEND=ISEND_TO_PROC(INS) + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) + IREQ(IR) = IREQUEST(IR)%MPI_VAL + ENDDO + + IF(IR > 0) THEN + CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & + & CDSTRING='TRLTOG: WAIT FOR SENDS AND RECEIVES') + ENDIF + +#ifdef USE_GPU_AWARE_MPI +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif +#else + !! this is safe-but-slow fallback for running without GPU-aware MPI + !$ACC UPDATE DEVICE(ZCOMBUFR) +#endif + + IF (LSYNC_TRANS) THEN + CALL GSTATS(441,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(441,1) + ENDIF + CALL GSTATS(421,1) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(ZCOMBUFR) +#endif + CALL GSTATS(805,1) + + ! Unpack loop......................................................... + + CALL GSTATS(1606,0) + DO INR=1,IRECV_COUNTS + IRECV=IRECV_TO_PROC(INR) + CALL PE2SET(IRECV,ISETA,ISETB,ISETW,ISETV) + + IRECV_FIELD_COUNT_V = IRECV_FIELD_COUNT(ISETV) + ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR) + + IFLDS = 0 + DO JFLD=1,KF_GP + IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS)=KPTRGP(JFLD) + ELSE + IFLDA(IFLDS)=JFLD + ENDIF + ENDIF + ENDDO + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA COPYIN(IFLDA(1:IRECV_FIELD_COUNT_V)) ASYNC(1) +#endif + + IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(ISETW) + IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(ISETW) + IF (PRESENT(PGP)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) & + !$ACC& FIRSTPRIVATE(IRECV_FIELD_COUNT_V,IRECV_WSET_SIZE_V,& + !$ACC& IRECV_WSET_OFFSET_V,NPROMA,ICOMBUFR_OFFSET_V) ASYNC(1) +#endif + DO JFLD=1,IRECV_FIELD_COUNT_V + DO JL=1,IRECV_WSET_SIZE_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD=IFLDA(JFLD) + JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL + PGP(JK,IFLD,JBLK) = ZCOMBUFR(JI) + ENDDO + ENDDO + ELSE +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) & + !$ACC& FIRSTPRIVATE(IRECV_FIELD_COUNT_V,IRECV_WSET_SIZE_V, & + !$ACC& IRECV_WSET_OFFSET_V,NPROMA,ICOMBUFR_OFFSET_V) ASYNC(1) +#endif + DO JFLD=1,IRECV_FIELD_COUNT_V + DO JL=1,IRECV_WSET_SIZE_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD=IFLDA(JFLD) + JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL + IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN + PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN + PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN + PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN + PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ENDIF + ENDDO + ENDDO + ENDIF +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA +#endif + ENDDO + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA ! ZOMBUFR +#endif + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(422,0) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA ! PGP3B + !$ACC END DATA ! PGP3A + !$ACC END DATA ! PGP2 + !$ACC END DATA ! PGPUV + !$ACC END DATA ! PGP +#endif + IF (PRESENT(PGP)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE HOST(PGP) +#endif + ENDIF + IF (PRESENT(PGPUV)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE HOST(PGPUV) +#endif + ENDIF + IF (PRESENT(PGP2)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE HOST(PGP2) +#endif + ENDIF + IF (PRESENT(PGP3A)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE HOST(PGP3A) +#endif + ENDIF + IF (PRESENT(PGP3B)) THEN +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC UPDATE HOST(PGP3B) +#endif + ENDIF + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(442,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(442,1) + ENDIF + CALL GSTATS(422,1) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA ! IRECVBUFR_TO_OUT,PGPINDICES + + !$ACC WAIT(1) +#endif + + CALL GSTATS(1606,1) + + IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) + END SUBROUTINE TRLTOG +END MODULE TRLTOG_MOD + diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 new file mode 100755 index 00000000..45bdf135 --- /dev/null +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -0,0 +1,232 @@ +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TRLTOM_MOD + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRLTOM, PREPARE_TRLTOM, TRLTOM_HANDLE + + TYPE TRLTOM_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF + END TYPE +CONTAINS + FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(TRLTOM_HANDLE) :: HTRLTOM + + REAL(KIND=JPRBT) :: DUMMY + + HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, int(D%NLENGT1B*2*KF_FS*SIZEOF(DUMMY),kind=c_size_t)) + END FUNCTION + + SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) + !**** *TRLTOM * - transposition in Fourierspace + + ! Purpose. + ! -------- + ! Transpose Fourier coefficients from partitioning + ! over latitudes to partitioning over wave numbers + ! This is done between inverse Legendre Transform + ! and inverse FFT. + ! This is the inverse routine of TRMTOL. + + !** Interface. + ! ---------- + ! *CALL* *TRLTOM(...)* + + ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is + ! -------------------- used for both input and output. + + ! KF_FS - Number of fields communicated + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + ! See documentation + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! MPP Group *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 95-10-01 + ! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use + ! (NCOMBFLEN) for nphase.eq.1 + ! Modified : 99-05-28 D.Salmond - Optimise copies. + ! Modified : 00-02-02 M.Hamrud - Remove NPHASE + ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message + ! passing and buffer packing + ! G.Mozdzynski: 08-01-01 Cleanup + ! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK + USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE MPI_F08 + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE ISO_C_BINDING, ONLY : C_SIZE_T + + IMPLICIT NONE + + INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS + REAL(KIND=JPRBT) ,INTENT(OUT), POINTER :: PFBUF(:) + REAL(KIND=JPRBT) ,INTENT(INOUT), POINTER :: PFBUF_IN(:) + + INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) + INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + INTEGER(KIND=JPIM) :: IERROR + + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOM_HANDLE), INTENT(IN) :: HTRLTOM + TYPE(MPI_COMM) :: LOCAL_COMM + +#ifdef PARKINDTRANS_SINGLE +#define TRLTOM_DTYPE MPI_FLOAT +#else +#define TRLTOM_DTYPE MPI_DOUBLE +#endif + LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM + + IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) + + CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),& + & 1_C_SIZE_T, int(D%NLENGT1B*2*KF_FS*SIZEOF(PFBUF(1)),kind=c_size_t)) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(PFBUF,PFBUF_IN) +#endif + + IF(NPROC > 1) THEN + DO J=1,NPRTRW + ILENS(J) = D%NLTSGTB(J)*2*KF_FS + IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS + ILENR(J) = D%NLTSFTB(J)*2*KF_FS + IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS + ENDDO + + CALL GSTATS(806,0) + + ! copy to self workaround + IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) + IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN + PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) + stop 1 + ENDIF + IF (ILENS(IRANK) > 0) THEN + FROM_SEND = IOFFS(IRANK) + 1 + TO_SEND = FROM_SEND + ILENS(IRANK) - 1 + FROM_RECV = IOFFR(IRANK) + 1 + TO_RECV = FROM_RECV + ILENR(IRANK) - 1 +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC KERNELS ASYNC(1) +#endif + PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END KERNELS +#endif + ILENS(IRANK) = 0 + ILENR(IRANK) = 0 + ENDIF + + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(411,0) +#ifdef USE_GPU_AWARE_MPI +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) +#endif +#else + !! this is safe-but-slow fallback for running without GPU-aware MPI + !$ACC UPDATE HOST(PFBUF_IN,PFBUF) +#endif + CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& + & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & + & LOCAL_COMM,IERROR) +#ifdef USE_GPU_AWARE_MPI +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif +#else + !! this is safe-but-slow fallback for running without GPU-aware MPI + !$ACC UPDATE DEVICE(PFBUF) +#endif + IF (LSYNC_TRANS) THEN + CALL GSTATS(431,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(431,1) + ENDIF + CALL GSTATS(411,1) + +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(806,1) + ELSE + ILEN = D%NLTSGTB(MYSETW)*2*KF_FS + ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 + CALL GSTATS(1607,0) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP DEFAULT(NONE) FIRSTPRIVATE(ISTA,ILEN) +#endif + DO J=ISTA,ISTA+ILEN-1 + PFBUF(J) = PFBUF_IN(J) + ENDDO + CALL GSTATS(1607,1) + ENDIF + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA +#endif + + IF (LHOOK) CALL DR_HOOK('TRLTOM',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ + END SUBROUTINE TRLTOM +END MODULE TRLTOM_MOD diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 new file mode 100755 index 00000000..b14a0188 --- /dev/null +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -0,0 +1,268 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TRLTOM_PACK_UNPACK + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRLTOM_PACK_HANDLE, PREPARE_TRLTOM_PACK, TRLTOM_PACK + PUBLIC :: TRLTOM_UNPACK_HANDLE, PREPARE_TRLTOM_UNPACK, TRLTOM_UNPACK + + TYPE TRLTOM_PACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN + END TYPE + TYPE TRLTOM_UNPACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HINPS_AND_ZINPA + END TYPE +CONTAINS + FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(TRLTOM_PACK_HANDLE) :: HTRLTOM_PACK + + REAL(KIND=JPRBT) :: DUMMY + + HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, int(D%NLENGT0B*KF_FS*2*SIZEOF(DUMMY),kind=c_size_t)) + END FUNCTION PREPARE_TRLTOM_PACK + + SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + !**** *TRLTOM_PACK* - Copy fourier data from local array to buffer + + ! Purpose. + ! -------- + ! Routine for copying fourier data from local array to buffer + + !** Interface. + ! ---------- + ! CALL TRLTOM_PACK(...) + + ! Explicit arguments : PREEL - local fourier/GP array + ! -------------------- KF_FS - number of fields + ! + ! Externals. None. + ! ---------- + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! ------------------------------------------------------------------ + + USE BUFFERED_ALLOCATOR_MOD + USE PARKIND_ECTRANS, ONLY : JPIM,JPRBT + USE TPM_DISTR, ONLY : D,MYSETW,D_NSTAGTF,D_NPNTGTB0,D_NPTRLS,D_NDGL_FS + USE TPM_GEOMETRY, ONLY : G_NMEN,G_NLOEN + USE TPM_DIM, ONLY: R_NSMAX + USE ISO_C_BINDING + ! + + IMPLICIT NONE + + REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: FOUBUF_IN(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOM_PACK_HANDLE), INTENT(IN) :: HTRLTOM_PACK + + INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL + + REAL(KIND=JPRBT) :: SCAL + + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& + & 1_C_SIZE_T, int(D%NLENGT0B*KF_FS*2*SIZEOF(FOUBUF_IN(1)),kind=c_size_t)) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(G_NMEN,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS,G_NLOEN, R_NSMAX) ASYNC(1) +#endif + + ! scale results and move into next transformation buffer + + OFFSET_VAR=D_NPTRLS(MYSETW) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,SCAL) FIRSTPRIVATE(KF_FS,OFFSET_VAR) DEFAULT(NONE) & + !$ACC& ASYNC(1) TILE(32,16,1) +#endif + DO KGL=1,D_NDGL_FS + DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + DO JF=1,KF_FS + IGLG = OFFSET_VAR+KGL-1 + IF (JM <= G_NMEN(IGLG)) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + ISTA = D_NPNTGTB0(JM,KGL)*KF_FS*2 + + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) + ENDIF + ENDDO + ENDDO + ENDDO +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA + + !$ACC WAIT(1) +#endif + END SUBROUTINE TRLTOM_PACK + + FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE LEDIR_MOD, ONLY: LEDIR_STRIDES + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK + + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE + INTEGER(KIND=C_SIZE_T) :: ISIZE + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY + + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) + + ! Check if the reuse buffer is large enough + ISIZE = ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) + + HTRLTOM_UNPACK%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) + END FUNCTION PREPARE_TRLTOM_UNPACK + + SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + USE PARKIND_ECTRANS, ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM, ONLY : R_NDGNH, R_NDGL + USE TPM_GEOMETRY, ONLY : G_NDGLU + USE TPM_FIELDS, ONLY : F_RW, F_RACTHE + USE TPM_DISTR, ONLY : D_NUMP,D_MYMS, D_NPNTGTB1,D_OFFSETS_GEMM1 + USE LEDIR_MOD, ONLY : LEDIR_STRIDES + USE, INTRINSIC :: ISO_C_BINDING + + IMPLICIT NONE + + REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF(:) + REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOM_UNPACK_HANDLE), INTENT(IN) :: HTRLTOM_UNPACK + + REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) + + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE + + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + + INTEGER(KIND=8) :: JF + INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, KMLOC + + REAL(KIND=JPRBT) :: PAIA, PAIS + + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) + + IALLOC_POS=1 + + IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINPS(0)),128) + CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINPA(0)),128) + CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINPS0(0)),128) + CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINPA0(0)),128) + CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA & + !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & + !$ACC& PRESENT(F_RW,F_RACTHE) & + !$ACC& PRESENT(D_MYMS,D_NUMP,R_NDGNH,R_NDGL,G_NDGLU) & + !$ACC& PRESENT(D_NPNTGTB1) + + !$ACC DATA PRESENT(FOUBUF,D_OFFSETS_GEMM1) + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) & + !$ACC& FIRSTPRIVATE(KF_FS,KF_UV,IIN_STRIDES0,IIN0_STRIDES0) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JGL=1,R_NDGNH + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS + PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) + PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) + IF (JF <= 4*KF_UV) THEN + ! Multiply in case of velocity + PAIA = PAIA*F_RACTHE(JGL) + PAIS = PAIS*F_RACTHE(JGL) + ENDIF + IF (KM /= 0) THEN + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIA*F_RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIS*F_RW(JGL) + ELSEIF (MOD(JF-1,2) == 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F_RW(JGL) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F_RW(JGL) + ENDIF + ENDIF + ENDDO + ENDDO + END DO +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END DATA + + !$ACC END DATA +#endif + END SUBROUTINE TRLTOM_UNPACK + +END MODULE TRLTOM_PACK_UNPACK + diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 new file mode 100755 index 00000000..6d5fc29b --- /dev/null +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -0,0 +1,221 @@ +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TRMTOL_MOD + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRMTOL, PREPARE_TRMTOL, TRMTOL_HANDLE + + TYPE TRMTOL_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF + END TYPE +CONTAINS + FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + TYPE(TRMTOL_HANDLE) :: HTRMTOL + + REAL(KIND=JPRBT) :: DUMMY + + HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, int(D%NLENGT0B*2*KF_LEG*SIZEOF(DUMMY),kind=c_size_t)) + END FUNCTION + + SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) + !**** *trmtol * - transposition in Fourier space + + ! Purpose. + ! -------- + ! Transpose Fourier buffer data from partitioning + ! over wave numbers to partitioning over latitudes. + ! It is called between direct FFT and direct Legendre + ! transform. + ! This routine is the inverse of TRLTOM. + + + !** Interface. + ! ---------- + ! *call* *trmtol(...)* + + ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is + ! -------------------- used for both input and output. + ! KF_LEG - Number of fields communicated + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + ! See documentation + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! MPP Group *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 95-10-01 + ! Modified : 97-06-17 G. Mozdzynski - control MPI mailbox use + ! (NCOMBFLEN) for nphase.eq.1 + ! Modified : 99-05-28 D.Salmond - Optimise copies. + ! Modified : 00-02-02 M.Hamrud - Remove NPHASE + ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message + ! passing and buffer packing + ! G.Mozdzynski: 08-01-01 Cleanup + ! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK + USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE MPI_F08 + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_LEG + REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PFBUF(:) + REAL(KIND=JPRBT), INTENT(IN) :: PFBUF_IN(:) + + INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) + INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + INTEGER(KIND=JPIM) :: IERROR + + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRMTOL_HANDLE), INTENT(IN) :: HTRMTOL + TYPE(MPI_COMM) :: LOCAL_COMM + +#ifdef PARKINDTRANS_SINGLE +#define TRMTOL_DTYPE MPI_FLOAT +#else +#define TRMTOL_DTYPE MPI_DOUBLE +#endif + LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM + + IF (LHOOK) CALL DR_HOOK('TRMTOL',0,ZHOOK_HANDLE) + + CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HPFBUF),& + & 1_C_SIZE_T, int(D%NLENGT0B*2*KF_LEG*SIZEOF(PFBUF(1)),kind=c_size_t)) + + IF(NPROC > 1) THEN + DO J=1,NPRTRW + ILENS(J) = D%NLTSFTB(J)*2*KF_LEG + IOFFS(J) = D%NSTAGT1B(J)*2*KF_LEG + ILENR(J) = D%NLTSGTB(J)*2*KF_LEG + IOFFR(J) = D%NSTAGT0B(J)*2*KF_LEG + ENDDO + + CALL GSTATS(807,0) + + ! copy to self workaround + IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) + IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN + PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) + stop 1 + ENDIF + IF (ILENS(IRANK) > 0) THEN + FROM_SEND = IOFFS(IRANK) + 1 + TO_SEND = FROM_SEND + ILENS(IRANK) - 1 + FROM_RECV = IOFFR(IRANK) + 1 + TO_RECV = FROM_RECV + ILENR(IRANK) - 1 +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) +#endif + PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END KERNELS +#endif + ILENS(IRANK) = 0 + ILENR(IRANK) = 0 + ENDIF + + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(421,0) +#ifdef USE_GPU_AWARE_MPI +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) +#endif +#else + !! this is safe-but-slow fallback for running without GPU-aware MPI + !$ACC UPDATE HOST(PFBUF_IN,PFBUF) +#endif + CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& + & PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& + & LOCAL_COMM,IERROR) +#ifdef USE_GPU_AWARE_MPI +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC END HOST_DATA +#endif +#else + !! this is safe-but-slow fallback for running without GPU-aware MPI + !$ACC UPDATE DEVICE(PFBUF) +#endif + IF (LSYNC_TRANS) THEN + CALL GSTATS(441,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(441,1) + ENDIF + CALL GSTATS(421,1) + +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(807,1) + ELSE + ILEN = D%NLTSGTB(MYSETW)*2*KF_LEG + ISTA = D%NSTAGT0B(MYSETW)*2*KF_LEG+1 + CALL GSTATS(1608,0) +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) FIRSTPRIVATE(ISTA,ILEN) +#endif + DO J=ISTA,ISTA+ILEN-1 + PFBUF(J) = PFBUF_IN(J) + ENDDO + CALL GSTATS(1608,1) + ENDIF + + IF (LHOOK) CALL DR_HOOK('TRMTOL',1,ZHOOK_HANDLE) + + ! ------------------------------------------------------------------ + END SUBROUTINE TRMTOL +END MODULE TRMTOL_MOD diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 new file mode 100755 index 00000000..e7076693 --- /dev/null +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -0,0 +1,293 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TRMTOL_PACK_UNPACK + USE BUFFERED_ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRMTOL_PACK, TRMTOL_PACK_HANDLE, PREPARE_TRMTOL_PACK + PUBLIC :: TRMTOL_UNPACK, TRMTOL_UNPACK_HANDLE, PREPARE_TRMTOL_UNPACK + + TYPE TRMTOL_PACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN + END TYPE + TYPE TRMTOL_UNPACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL + END TYPE + +CONTAINS + FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DISTR, ONLY: D + USE ISO_C_BINDING + USE LEINV_MOD + USE BUFFERED_ALLOCATOR_MOD + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + + TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK + + INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + + IALLOC_SZ = D%NLENGT1B*2*KF_LEG*SIZEOF(ZPRBT_DUMMY) + HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, int(IALLOC_SZ,kind=c_size_t)) + END FUNCTION + SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG) + + !**** *TRMTOL_PACK* - Packing buffer for TRMTOL + + ! Purpose. + ! -------- + ! Packs data from LTINV outputs into FOUBUF for conversion to fourier space + + !** Interface. + ! ---------- + ! CALL TRMTOL_PACK(...) + + ! Explicit arguments : ZOUTS - symmetric data + ! -------------------- ZOUTA - asymmetric data + ! ZOUTS0 - symmetric data for KMLOC0 + ! ZOUTA0 - asymmetric data for KMLOC0 + ! FOUBUF_IN - output towards TRMTOL + ! KF_LEG - number of fields (we have 2XKF_LEG because complex) + + ! Implicit arguments : None. + ! -------------------- + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Nils Wedi + Mats Hamrud + George Modzynski + ! + ! Modifications. + ! -------------- + ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: + ! F. Vana 05-Mar-2015 Support for single precision + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB,JPRBT,JPRD + USE YOMHOOK, ONLY : LHOOK,DR_HOOK, JPHOOK + USE TPM_DIM, ONLY : R_NDGNH,R_NDGL + USE TPM_GEOMETRY,ONLY : G_NDGLU + USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS,D_NPNTGTB1,D_OFFSETS_GEMM1 + USE LEINV_MOD, ONLY: LEINV_STRIDES + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + + ! DUMMY ARGUMENTS + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRMTOL_PACK_HANDLE), INTENT(IN) :: HTRMTOL_PACK + REAL(KIND=JPRB), INTENT(OUT), POINTER :: FOUBUF_IN(:) + REAL(KIND=JPRBT), INTENT(IN) :: ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), INTENT(IN) :: ZOUTS0(:), ZOUTA0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + + ! LOCAL + REAL(KIND=JPRBT) :: ZAOA, ZSOA + + INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS, OFFSET1, OFFSET2 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + + IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE) + + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& + & 1_C_SIZE_T, int(D%NLENGT1B*2*KF_LEG*SIZEOF(FOUBUF_IN(1)),kind=c_size_t)) + + CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(D_MYMS,D_NPNTGTB1,D_NUMP,G_NDGLU,R_NDGNH,R_NDGL) & + !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1) +#endif + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) & + !$ACC& FIRSTPRIVATE(KF_LEG,IOUT_STRIDES0,IOUT0_STRIDES0) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JGL=1,R_NDGNH + DO JK=1,2*KF_LEG + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_LEG + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG + + IF(KM /= 0) THEN + ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0) + ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0) + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + ELSE + ! Imaginary values of KM=0 is zero, though I don't think we care + ZSOA = 0_JPRBT + ZAOA = 0_JPRBT + ENDIF + + FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA + FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA + ENDIF + ENDDO + ENDDO + ENDDO + +#ifdef OMPGPU +#endif +#ifdef ACCGPU + !$ACC WAIT(1) + + !$ACC END DATA +#endif + + IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',1,ZHOOK_HANDLE) + + END SUBROUTINE TRMTOL_PACK + + FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + USE ISO_C_BINDING, ONLY: C_SIZE_T + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM) :: KF_FS + + TYPE(TRMTOL_UNPACK_HANDLE) :: HTRMTOL_UNPACK + + REAL(KIND=JPRBT) :: DUMMY + + HTRMTOL_UNPACK%HREEL = RESERVE(ALLOCATOR, int(D%NLENGTF*KF_FS*SIZEOF(DUMMY),kind=c_size_t)) + + END FUNCTION PREPARE_TRMTOL_UNPACK +SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) + +!**** *TRMTOL_UNPACK* - Copy fourier data from buffer to local array + +! Purpose. +! -------- +! Routine for copying fourier data from buffer to local array + +!** Interface. +! ---------- +! CALL TRMTOL_UNPACK(...) + +! Explicit arguments : PREEL_COMPLEX - local fourier/GP array +! -------------------- KF_CURRENT - number of fields that are read (from Legendre space) +! KF_TOTAL - total fields in PREEL ("stride") +! +! Externals. None. +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 2000-04-01 + +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT +USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS,D_NDGL_FS +USE TPM_GEOMETRY ,ONLY : G_NMEN,G_NLOEN,G_NLOEN_MAX +USE ISO_C_BINDING ,ONLY : C_SIZE_T +! + +IMPLICIT NONE + +REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(TRMTOL_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK + +INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL +REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX + +CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& + & 1_C_SIZE_T, int(KF_TOTAL*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1)),kind=c_size_t)) + +#ifdef OMPGPU +#endif +#ifdef ACCGPU +!$ACC DATA PRESENT(G_NLOEN,G_NMEN,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1) +#endif + +OFFSET_VAR=D_NPTRLS(MYSETW) +#ifdef OMPGPU +#endif +#ifdef ACCGPU +!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) FIRSTPRIVATE(KF_CURRENT,& +!$ACC& KF_TOTAL,OFFSET_VAR,G_NLOEN_MAX) DEFAULT(NONE) & +!$ACC& ASYNC(1) TILE(32,16,1) +#endif +DO KGL=1,D_NDGL_FS + DO JF=1,KF_CURRENT + DO JM=0,G_NLOEN_MAX/2 + IGLG = OFFSET_VAR+KGL-1 + + ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have + ! to fill those floor(NLON/2)+1 values. + ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. + IF (JM <= G_NLOEN(IGLG)/2) THEN + RET_REAL = 0.0_JPRBT + RET_COMPLEX = 0.0_JPRBT + IF (JM <= G_NMEN(IGLG)) THEN + ISTA = D_NPNTGTB0(JM,KGL)*KF_CURRENT*2 + + RET_REAL = FOUBUF(ISTA+2*JF-1) + RET_COMPLEX = FOUBUF(ISTA+2*JF ) + ENDIF + IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + PREEL_COMPLEX(IOFF_LAT+2*JM+1) = RET_REAL + PREEL_COMPLEX(IOFF_LAT+2*JM+2) = RET_COMPLEX + ENDIF + ENDDO + ENDDO +ENDDO +#ifdef OMPGPU +#endif +#ifdef ACCGPU +!$ACC END DATA + +!$ACC WAIT(1) +#endif + +END SUBROUTINE TRMTOL_UNPACK +END MODULE TRMTOL_PACK_UNPACK + diff --git a/src/trans/gpu/internal/updsp_mod.F90 b/src/trans/gpu/internal/updsp_mod.F90 new file mode 100755 index 00000000..3c1209f9 --- /dev/null +++ b/src/trans/gpu/internal/updsp_mod.F90 @@ -0,0 +1,164 @@ +! (C) Copyright 1988- ECMWF. +! (C) Copyright 1988- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE UPDSP_MOD +CONTAINS +SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1, & + & PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2 , & + & KFLDPTRUV,KFLDPTRSC) + +!**** *UPDSP* - Update spectral arrays after direct Legendre transform + +! Purpose. +! -------- +! To update the spectral arrays for a fixed zonal wave-number +! from values in POA1 and POA2. + +!** Interface. +! ---------- +! CALL UPDSP(...) + +! Explicit arguments : +! -------------------- +! KM - zonal wave-number +! POA1 - spectral fields for zonal wavenumber KM (basic var.) +! PSPSCALAR - spectral scalar variables + +! Implicit arguments : +! -------------------- + +! Method. +! ------- + +! Externals. UPDSPB - basic transfer routine +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 88-02-02 +! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite +! for uv formulation +! Modified : 94-08-02 R. El Khatib - interface to UPDSPB +! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div +! instead of u,v->vor,div +! MPP Group: 95-10-01 Support for Distributed Memory version +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + +USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B +USE TPM_DISTR ,ONLY : D + +USE UPDSPB_MOD ,ONLY : UPDSPB + +IMPLICIT NONE + + +! DUMMY INTEGER SCALARS + +INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV,KF_SCALARS + +REAL(KIND=JPRBT) , INTENT(IN) :: POA1(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) +REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC2(:,:) +REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3A(:,:,:) +REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3B(:,:,:) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) +INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: IST ,IEND,JFLD,IFLD,IDIM1,IDIM3,J3 + + +! ------------------------------------------------------------------ + +!* 1. UPDATE FIELDS +! ------------- + +!* 1.1 VORTICITY AND DIVERGENCE. + +#ifdef ACCGPU +!$ACC DATA PRESENT(PSPSCALAR) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSCALAR)) +!$ACC DATA PRESENT(PSPSC2) IF(NF_SC2 > 0) +!$ACC DATA PRESENT(PSPSC3A) IF(NF_SC3A > 0) +!$ACC DATA PRESENT(PSPSC3B) IF(NF_SC3B > 0) +#endif +#ifdef OMPGPU +!WARNING: following lines should be PRESENT,ALLOC but cause issues with AMD compiler! +!$OMP TARGET DATA MAP(ALLOC:PSPSCALAR) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSCALAR)) +!$OMP TARGET DATA MAP(ALLOC:PSPSC2) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC2) .AND. NF_SC2 > 0) +!$OMP TARGET DATA MAP(ALLOC:PSPSC3A) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3A) .AND. NF_SC3A > 0) +!$OMP TARGET DATA MAP(ALLOC:PSPSC3B) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3B) .AND. NF_SC3B > 0) +#endif + +IST = 1 +IST = IST+4*KF_UV + +!* 1.2 SCALARS + +IF (KF_SCALARS > 0) THEN + IF(PRESENT(PSPSCALAR)) THEN + IEND = IST+2*KF_SCALARS-1 + CALL UPDSPB(KF_SCALARS,POA1(IST:IEND,:,:),PSPSCALAR,KFLDPTRSC) + ELSE + IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN + IDIM1 = NF_SC2 + IEND = IST+2*IDIM1-1 + CALL UPDSPB(IDIM1,POA1(IST:IEND,:,:),PSPSC2) + IST=IST+2*IDIM1 + ENDIF + IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN + IDIM1=NF_SC3A + IDIM3=UBOUND(PSPSC3A,3) + DO J3=1,IDIM3 + IEND = IST+2*IDIM1-1 + CALL UPDSPB(IDIM1,POA1(IST:IEND,:,:),PSPSC3A(:,:,J3)) + IST=IST+2*IDIM1 + ENDDO + ENDIF + IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN + IDIM1=NF_SC3B + IDIM3=UBOUND(PSPSC3B,3) + DO J3=1,IDIM3 + IEND = IST+2*IDIM1-1 + CALL UPDSPB(IDIM1,POA1(IST:IEND,:,:),PSPSC3B(:,:,J3)) + IST=IST+2*IDIM1 + ENDDO + ENDIF + ENDIF +ENDIF + +#ifdef OMPGPU +!$OMP END TARGET DATA +!$OMP END TARGET DATA +!$OMP END TARGET DATA +!$OMP END TARGET DATA +#endif +#ifdef ACCGPU +!$ACC END DATA +!$ACC END DATA +!$ACC END DATA +!$ACC END DATA +#endif + +! ------------------------------------------------------------------ + +END SUBROUTINE UPDSP +END MODULE UPDSP_MOD diff --git a/src/trans/gpu/internal/updspb_mod.F90 b/src/trans/gpu/internal/updspb_mod.F90 new file mode 100755 index 00000000..79c5326e --- /dev/null +++ b/src/trans/gpu/internal/updspb_mod.F90 @@ -0,0 +1,143 @@ +! (C) Copyright 1988- ECMWF. +! (C) Copyright 1988- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE UPDSPB_MOD + CONTAINS + SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) + + + !**** *UPDSPB* - Update spectral arrays after direct Legendre transform + + ! Purpose. + ! -------- + ! To update spectral arrays for a fixed zonal wave-number + ! from values in POA. + + !** Interface. + ! ---------- + ! CALL UPDSPB(....) + + ! Explicit arguments : KM - zonal wavenumber + ! -------------------- KFIELD - number of fields + ! POA - work array + ! PSPEC - spectral array + + ! Implicit arguments : None + ! -------------------- + + ! Method. + ! ------- + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Mats Hamrud and Philippe Courtier *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 88-02-02 + ! D. Giard : 93-03-19 truncations NSMAX and NTMAX (see NOTE) + ! R. El Khatib : 94-08-02 Replace number of fields by indexes of the + ! first and last field + ! L. Isaksen : 95-06-06 Reordering of spectral arrays + ! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + + USE TPM_DIM ,ONLY : R_NTMAX + !USE TPM_FIELDS + USE TPM_DISTR ,ONLY : D_NUMP,D_MYMS,D_NASM0 + ! + + IMPLICIT NONE + + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + REAL(KIND=JPRBT) ,INTENT(IN) :: POA(:,:,:) + REAL(KIND=JPRB) ,INTENT(OUT) :: PSPEC(:,:) + INTEGER(KIND=JPIM),INTENT(IN),OPTIONAL :: KFLDPTR(:) + + ! LOCAL INTEGER SCALARS + INTEGER(KIND=JPIM) :: KM,KMLOC + INTEGER(KIND=JPIM) :: II, INM, IR, JFLD, JN, ISMAX, ITMAX, IASM0,IFLD + + + ! ------------------------------------------------------------------ + + !* 0. NOTE. + ! ----- + + ! The following transfer reads : + ! SPEC(k,NASM0(m)+NLTN(n)*2) =POA(nn,2*k-1) (real part) + ! SPEC(k,NASM0(m)+NLTN(n)*2+1)=POA(nn,2*k ) (imaginary part) + ! with n from m to NSMAX + ! and nn=NTMAX+2-n from NTMAX+2-m to NTMAX+2-NSMAX. + ! NLTN(m)=NTMAX+2-m : n=NLTN(nn),nn=NLTN(n) + ! nn is the loop index. + + IF(PRESENT(KFLDPTR)) THEN + stop 'Error: code path not (yet) supported in GPU version' + ENDIF + + !* 1. UPDATE SPECTRAL FIELDS. + ! ----------------------- + + !loop over wavenumber +#ifdef ACCGPU + !$ACC DATA PRESENT(PSPEC,POA,R_NTMAX,D_NUMP,D_MYMS,D_NASM0) ASYNC(1) +#endif +#ifdef OMPGPU +!WARNING: following line should be PRESENT,ALLOC but causes issues with AMD compiler! + !$OMP TARGET DATA MAP(ALLOC:PSPEC,POA) & + !$OMP& MAP(TO:R_NTMAX,D_NUMP,D_MYMS,D_NASM0) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,IASM0,INM,IR,II) DEFAULT(NONE) & + !$OMP& SHARED(R_NTMAX,D_NUMP,D_MYMS,D_NASM0,PSPEC,KFIELD,POA) +#endif +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,IASM0,INM) DEFAULT(NONE) & + !$ACC& FIRSTPRIVATE(KFIELD) ASYNC(1) +#endif + DO KMLOC=1,D_NUMP + DO JN=3,R_NTMAX+3 + DO JFLD=1,KFIELD + KM = D_MYMS(KMLOC) + IASM0 = D_NASM0(KM) + + IF(KM /= 0 .AND. JN <= R_NTMAX+3-KM) THEN + !(DO JN=3,R_NTMAX+3-KM) + INM = IASM0+((R_NTMAX+3-JN)-KM)*2 + PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) + PSPEC(JFLD,INM+1) = POA(2*JFLD ,JN,KMLOC) + ELSEIF (KM == 0) THEN + !(DO JN=3,R_NTMAX+3) + INM = IASM0+(R_NTMAX+3-JN)*2 + PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) + PSPEC(JFLD,INM+1) = 0.0_JPRBT + END IF + ENDDO + ENDDO + ENDDO +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END DATA +#endif + + ! ------------------------------------------------------------------ + + END SUBROUTINE UPDSPB +END MODULE UPDSPB_MOD diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 new file mode 100755 index 00000000..e234533d --- /dev/null +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -0,0 +1,177 @@ +! (C) Copyright 1991- ECMWF. +! (C) Copyright 1991- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE UVTVD_MOD +CONTAINS +SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) + +!**** *UVTVD* - Compute vor/div from u and v in spectral space + +! Purpose. +! -------- +! To compute vorticity and divergence from u and v in spectral +! space. Input u and v from KM to NTMAX+1, output vorticity and +! divergence from KM to NTMAX. + +!** Interface. +! ---------- +! CALL UVTVD(KM,KF_UV,PEPSNM,PU,PV,PVOR,PDIV) + +! Explicit arguments : KM - zonal wave-number +! -------------------- KF_UV - number of fields (levels) +! PEPSNM - REPSNM for wavenumber KM +! PU - u wind component for zonal +! wavenumber KM +! PV - v wind component for zonal +! wavenumber KM +! PVOR - vorticity for zonal +! wavenumber KM +! PDIV - divergence for zonal +! wavenumber KM + + +! Method. See ref. +! ------- + +! Externals. None. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 91-07-01 +! D. Giard : NTMAX instead of NSMAX +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +USE TPM_DIM ,ONLY : R, R_NTMAX +USE TPM_FIELDS ,ONLY : F_RN +USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS +USE TPM_FIELDS ,ONLY : ZEPSNM +! + +IMPLICIT NONE + +! DUMMY INTEGER SCALARS +INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV +REAL(KIND=JPRBT), INTENT(OUT) :: PVOR(:,:,:),PDIV(:,:,:) +REAL(KIND=JPRBT), INTENT(INOUT) :: PU (:,:,:),PV (:,:,:) +INTEGER(KIND=JPIM) :: KM, KMLOC + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: II, IN, IR, J, JN, ITMAX + +! LOCAL REAL SCALARS +REAL(KIND=JPRBT) :: ZKM,ZJN + +! ------------------------------------------------------------------ + +!* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. +! ------------------------------------------ + +#ifdef ACCGPU +!$ACC DATA & +!$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX) & +!$ACC& PRESENT(ZEPSNM,PU,PV,PVOR,PDIV) ASYNC(1) +#endif +#ifdef OMPGPU +!WARNING: following line should be PRESENT,ALLOC but causes issues with AMD compiler! +!$OMP TARGET DATA& +!$OMP& MAP(TO:D_MYMS,D_NUMP,R_NTMAX) & +!$OMP& MAP(ALLOC:ZEPSNM,PU,PV,PVOR,PDIV) +#endif + +!* 1.1 SET N=KM-1 COMPONENT TO 0 FOR U AND V + +#ifdef OMPGPU +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) +#endif +#ifdef ACCGPU +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM) DEFAULT(NONE) & +!$ACC& FIRSTPRIVATE(KF_UV) ASYNC(1) +#endif +DO KMLOC=1,D_NUMP + DO J=1,2*KF_UV + KM = D_MYMS(KMLOC) + PU(J,R_NTMAX+4-KM,KMLOC) = 0.0_JPRBT + PV(J,R_NTMAX+4-KM,KMLOC) = 0.0_JPRBT + ENDDO +ENDDO + +!* 1.2 COMPUTE VORTICITY AND DIVERGENCE. + +#ifdef OMPGPU +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,JN,ZJN) DEFAULT(NONE) & +!$OMP& SHARED(D_NUMP,R_NTMAX,KF_UV,D_MYMS,PVOR,PV,PU,PDIV,ZEPSNM) +#endif +#ifdef ACCGPU +!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,JN,ZJN) DEFAULT(NONE) & +!$ACC& FIRSTPRIVATE(KF_UV) ASYNC(1) +#endif +DO KMLOC=1,D_NUMP + DO JN=0,R_NTMAX + DO J=1,KF_UV + IR = 2*J-1 + II = IR+1 + KM = D_MYMS(KMLOC) + ZKM = REAL(KM,JPRBT) + + IF(KM /= 0 .AND. JN >= KM) THEN + ! (DO JN=KN,R_NTMAX) + IN = R_NTMAX+3-JN + ZJN = JN + + PVOR(IR,IN,KMLOC) = -ZKM*PV(II,IN,KMLOC)-& + &ZJN*ZEPSNM(KMLOC,JN+1)*PU(IR,IN-1,KMLOC)+& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PU(IR,IN+1,KMLOC) + PVOR(II,IN,KMLOC) = +ZKM*PV(IR,IN,KMLOC)-& + &ZJN*ZEPSNM(KMLOC,JN+1)*PU(II,IN-1,KMLOC)+& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PU(II,IN+1,KMLOC) + PDIV(IR,IN,KMLOC) = -ZKM*PU(II,IN,KMLOC)+& + &ZJN*ZEPSNM(KMLOC,JN+1)*PV(IR,IN-1,KMLOC)-& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(IR,IN+1,KMLOC) + PDIV(II,IN,KMLOC) = +ZKM*PU(IR,IN,KMLOC)+& + &ZJN*ZEPSNM(KMLOC,JN+1)*PV(II,IN-1,KMLOC)-& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(II,IN+1,KMLOC) + + ELSEIF(KM == 0) THEN + ! (DO JN=0,R_NTMAX) + IN = R_NTMAX+3-JN + ZJN = JN + + PVOR(IR,IN,KMLOC) = -& + &ZJN*ZEPSNM(KMLOC,JN+1)*PU(IR,IN-1,KMLOC)+& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PU(IR,IN+1,KMLOC) + PDIV(IR,IN,KMLOC) = & + &ZJN*ZEPSNM(KMLOC,JN+1)*PV(IR,IN-1,KMLOC)-& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(IR,IN+1,KMLOC) + ENDIF + ENDDO + ENDDO +ENDDO +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif +#ifdef ACCGPU +!$ACC END DATA +#endif +! ------------------------------------------------------------------ + +END SUBROUTINE UVTVD +END MODULE UVTVD_MOD diff --git a/src/trans/gpu/internal/vd2uv_ctl_mod.F90 b/src/trans/gpu/internal/vd2uv_ctl_mod.F90 new file mode 100755 index 00000000..b875505e --- /dev/null +++ b/src/trans/gpu/internal/vd2uv_ctl_mod.F90 @@ -0,0 +1,81 @@ +! (C) Copyright 2015- ECMWF. +! (C) Copyright 2015- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE VD2UV_CTL_MOD +CONTAINS +SUBROUTINE VD2UV_CTL(KF_UV,PSPVOR,PSPDIV,PU,PV) + +!**** *VD2UV_CTL* - Control routine for going from vor/div to spectral U and V. + +! Purpose. +! -------- +! Control routine for computing spectral U (u*cos(theta)) and V + +!** Interface. +! ---------- +! CALL INV_TRANS_CTL(...) +! KF_UV - local number of spectral u-v fields +! PSPVOR(:,:) - spectral vorticity (input) +! PSPDIV(:,:) - spectral divergence (input) +! PU(:,:) - U (out) +! PV(:,:) - V (out) + +! Method. +! ------- + +! Externals. +! ---------- + + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : July 2015 + +! ------------------------------------------------------------------ + +USE PARKIND1 ,ONLY : JPIM ,JPRB + +USE TPM_DISTR ,ONLY : D + +USE VD2UV_MOD ,ONLY : VD2UV + +IMPLICIT NONE + +INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV +REAL(KIND=JPRB),INTENT(IN) :: PSPVOR(:,:) +REAL(KIND=JPRB),INTENT(IN) :: PSPDIV(:,:) +REAL(KIND=JPRB),INTENT(OUT) :: PU(:,:) +REAL(KIND=JPRB),INTENT(OUT) :: PV(:,:) + +INTEGER(KIND=JPIM) :: JM,IM,ILEI2 + +! ------------------------------------------------------------------ + +CALL GSTATS(102,0) +ILEI2 = 8*KF_UV + +CALL GSTATS(1647,0) +!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JM,IM) +DO JM=1,D%NUMP + IM = D%MYMS(JM) + CALL VD2UV(IM,JM,KF_UV,ILEI2,PSPVOR,PSPDIV,PU,PV) +ENDDO +!$OMP END PARALLEL DO +CALL GSTATS(1647,1) +CALL GSTATS(102,1) + +! ------------------------------------------------------------------ + +END SUBROUTINE VD2UV_CTL +END MODULE VD2UV_CTL_MOD diff --git a/src/trans/gpu/internal/vd2uv_mod.F90 b/src/trans/gpu/internal/vd2uv_mod.F90 new file mode 100755 index 00000000..0806699b --- /dev/null +++ b/src/trans/gpu/internal/vd2uv_mod.F90 @@ -0,0 +1,157 @@ +! (C) Copyright 2015- ECMWF. +! (C) Copyright 2015- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE VD2UV_MOD +CONTAINS +SUBROUTINE VD2UV(KM,KMLOC,KF_UV,KLEI2,PSPVOR,PSPDIV,PU,PV) + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + +USE TPM_CONSTANTS +USE TPM_DIM ,ONLY : R +USE TPM_DISTR ,ONLY : D + +USE PREPSNM_MOD ,ONLY : PREPSNM +USE PRFI1B_MOD ,ONLY : PRFI1B +USE VDTUV_MOD ,ONLY : VDTUV + + +!**** *VD2UV* - U and V from Vor/div +! +! Purpose. +! -------- +! +!** Interface. +! ---------- +! *CALL* *VD2UV(...) + +! Explicit arguments : +! -------------------- +! KM - zonal wavenumber +! KMLOC - local zonal wavenumber +! PSPVOR - spectral vorticity +! PSPDIV - spectral divergence +! PU(:,:) - spectral U (out) +! PV(:,:) - spectral V (out) + + +! Implicit arguments : + +! Method. +! ------- + +! Externals. +! ---------- + +! PREPSNM - prepare REPSNM for wavenumber KM +! PRFI1B - prepares the spectral fields +! VDTUV - compute u and v from vorticity and divergence + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS +! Temperton, 1991, MWR 119 p1303 + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : July 2015 +! +! ------------------------------------------------------------------ + +IMPLICIT NONE + +INTEGER(KIND=JPIM), INTENT(IN) :: KM +INTEGER(KIND=JPIM), INTENT(IN) :: KMLOC +INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV +INTEGER(KIND=JPIM), INTENT(IN) :: KLEI2 + +REAL(KIND=JPRB) , INTENT(IN) :: PSPVOR(:,:) +REAL(KIND=JPRB) , INTENT(IN) :: PSPDIV(:,:) +REAL(KIND=JPRB) , INTENT(OUT) :: PU(:,:) +REAL(KIND=JPRB) , INTENT(OUT) :: PV(:,:) + +REAL(KIND=JPRB) :: ZIA(R%NLEI1,KLEI2) +REAL(KIND=JPRBT) :: ZEPSNM(0:R%NTMAX+2),ZA_R + +INTEGER(KIND=JPIM) :: IFC, ISTA, IIFC, IDGLU, JGL, JFLD,ILCM +INTEGER(KIND=JPIM) :: IVORL,IVORU,IDIVL,IDIVU,IUL,IUU,IVL,IVU,ISL,II,IR,INM,J +INTEGER(KIND=JPIM) :: IFIRST, ILAST, IOFF + +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + +! ------------------------------------------------------------------ + +!* 1. PERFORM LEGENDRE TRANFORM. +! -------------------------- + +IF (LHOOK) CALL DR_HOOK('VD2UV_MOD',0,ZHOOK_HANDLE) + +! ------------------------------------------------------------------ + +!* 1. PREPARE ZEPSNM. +! --------------- + +stop 'Error: code path not (yet) supported in GPU version' +!CALL PREPSNM(KM,KMLOC,ZEPSNM) + +! ------------------------------------------------------------------ + + +!* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. +! ---------------------------------------------- + +IFIRST = 1 +ILAST = 4*KF_UV + +IF (KF_UV > 0) THEN + IVORL = 1 + IVORU = 2*KF_UV + IDIVL = 2*KF_UV+1 + IDIVU = 4*KF_UV + IUL = 4*KF_UV+1 + IUU = 6*KF_UV + IVL = 6*KF_UV+1 + IVU = 8*KF_UV + stop 'Error: code path not (yet) supported in GPU version' + !CALL PRFI1B(KM,ZIA(:,IVORL:IVORU),PSPVOR,KF_UV) + !CALL PRFI1B(KM,ZIA(:,IDIVL:IDIVU),PSPDIV,KF_UV) + + !CALL VDTUV(KM,KF_UV,ZEPSNM,ZIA(:,IVORL:IVORU),ZIA(:,IDIVL:IDIVU),& + ! & ZIA(:,IUL:IUU),ZIA(:,IVL:IVU)) + ILCM = R%NSMAX+1-KM + IOFF = D%NASM0(KM) + ZA_R = 1.0_JPRBT/RA + DO J=1,ILCM + INM = IOFF+(ILCM-J)*2 + DO JFLD=1,KF_UV + IR = 2*(JFLD-1)+1 + II = IR+1 + PU(JFLD,INM ) = ZIA(J+2,IR+IUL-1)*ZA_R + PU(JFLD,INM+1) = ZIA(J+2,II+IUL-1)*ZA_R + PV(JFLD,INM ) = ZIA(J+2,IR+IVL-1)*ZA_R + PV(JFLD,INM+1) = ZIA(J+2,II+IVL-1)*ZA_R + ENDDO + ENDDO +ENDIF + +IF (LHOOK) CALL DR_HOOK('VD2UV_MOD',1,ZHOOK_HANDLE) +! ------------------------------------------------------------------ + +END SUBROUTINE VD2UV +END MODULE VD2UV_MOD + + + + diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 new file mode 100755 index 00000000..2873adf4 --- /dev/null +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -0,0 +1,165 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2000- Meteo-France. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE VDTUV_MOD +CONTAINS +SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + +USE TPM_DIM ,ONLY : R, R_NTMAX +USE TPM_FIELDS ,ONLY : F, F_RLAPIN, F_RN +USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS +USE TPM_GEN ,ONLY : NOUT + + +!**** *VDTUV* - Compute U,V in spectral space + +! Purpose. +! -------- +! In Laplace space compute the the winds +! from vorticity and divergence. + +!** Interface. +! ---------- +! CALL VDTUV(...) + +! Explicit arguments : KM -zonal wavenumber (input-c) +! -------------------- KFIELD - number of fields (input-c) +! PEPSNM - REPSNM for wavenumber KM (input-c) +! PVOR(NLEI1,2*KFIELD) - vorticity (input) +! PDIV(NLEI1,2*KFIELD) - divergence (input) +! PU(NLEI1,2*KFIELD) - u wind (output) +! PV(NLEI1,2*KFIELD) - v wind (output) +! Organisation within NLEI1: +! NLEI1 = NSMAX+4+mod(NSMAX+4+1,2) +! overdimensioning +! 1 : n=NSMAX+2 +! 2 : n=NSMAX+1 +! 3 : n=NSMAX +! . : +! . : +! NSMAX+3 : n=0 +! NSMAX+4 : n=-1 + +! Implicit arguments : Eigenvalues of inverse Laplace operator +! -------------------- from YOMLAP + +! Method. +! ------- + +! Externals. None. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS +! Temperton, 1991, MWR 119 p1303 + +! Author. +! ------- +! Mats Hamrud and Philippe Courtier *ECMWF* + +! Modifications. +! -------------- +! Original : 00-02-01 From VDTUV in IFS CY22R1 + +! ------------------------------------------------------------------ + +IMPLICIT NONE + +INTEGER(KIND=JPIM) :: KM, kmloc +INTEGER(KIND=JPIM), INTENT(IN) :: KFIELD +REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(1:D%NUMP,0:R%NTMAX+2) +REAL(KIND=JPRB), INTENT(INOUT) :: PVOR(:,:,:),PDIV(:,:,:) +REAL(KIND=JPRB), INTENT(OUT) :: PU (:,:,:),PV (:,:,:) + +! LOCAL INTEGER SCALARS +INTEGER(KIND=JPIM) :: II, IJ, IR, J, JN, JI + +! LOCAL REAL SCALARS +REAL(KIND=JPRBT) :: ZKM + +#ifdef ACCGPU +!$ACC DATA & +!$ACC& PRESENT(R_NTMAX,D_MYMS,D_NUMP,F_RLAPIN,F_RN) & +!$ACC& PRESENT(PEPSNM, PVOR, PDIV) & +!$ACC& PRESENT(PU, PV) +#endif +#ifdef OMPGPU +!$OMP TARGET DATA & +!$OMP& MAP (PRESENT,ALLOC:ZEPSNM, ZN, ZLAPIN) & +!$OMP& MAP (TO:R_NSMAX, D_MYMS,D_NUMP,F_RLAPIN,F_RN) & +!$OMP& MAP(PRESENT,ALLOC:ZEPSNM, PVOR, PDIV) & +!$OMP& MAP(PRESENT,ALLOC:PU, PV) +#endif + +! ------------------------------------------------------------------ + +!* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. +! ------------------------------------------ + +#ifdef OMPGPU +#endif +#ifdef ACCGPU +!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IR,II,KM,ZKM,JI) & +!$ACC& FIRSTPRIVATE(KFIELD,KMLOC) ASYNC(1) +#endif +DO KMLOC=1,D_NUMP + DO JN=0,R_NTMAX+1 + DO J=1,KFIELD + IR = 2*J-1 + II = IR+1 + KM = D_MYMS(KMLOC) + ZKM = REAL(KM,JPRBT) + + IF(KM /= 0 .AND. JN >= KM) THEN + ! (DO JN=KN,R_NTMAX) + JI = R_NTMAX+3-JN + PU(IR,JI,KMLOC) = -ZKM*F_RLAPIN(JN)*PDIV(II,JI,KMLOC)+& + &(JN-1)*PEPSNM(KMLOC,JN)*F_RLAPIN(JN-1)*PVOR(IR,JI+1,KMLOC)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F_RLAPIN(JN+1)*PVOR(IR,JI-1,KMLOC) + PU(II,JI,KMLOC) = +ZKM*F_RLAPIN(JN)*PDIV(IR,JI,KMLOC)+& + &(JN-1)*PEPSNM(KMLOC,JN)*F_RLAPIN(JN-1)*PVOR(II,JI+1,KMLOC)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F_RLAPIN(JN+1)*PVOR(II,JI-1,KMLOC) + PV(IR,JI,KMLOC) = -ZKM*F_RLAPIN(JN)*PVOR(II,JI,KMLOC)-& + &(JN-1)*PEPSNM(KMLOC,JN)*F_RLAPIN(JN-1)*PDIV(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F_RLAPIN(JN+1)*PDIV(IR,JI-1,KMLOC) + PV(II,JI,KMLOC) = +ZKM*F_RLAPIN(JN)*PVOR(IR,JI,KMLOC)-& + &(JN-1)*PEPSNM(KMLOC,JN)*F_RLAPIN(JN-1)*PDIV(II,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F_RLAPIN(JN+1)*PDIV(II,JI-1,KMLOC) + + ELSEIF(KM == 0) THEN + ! (DO JN=0,R_NTMAX) + JI = R_NTMAX+3-JN + + PU(IR,JI,KMLOC) = +& + &(JN-1)*PEPSNM(KMLOC,JN)*F_RLAPIN(JN-1)*PVOR(IR,JI+1,KMLOC)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F_RLAPIN(JN+1)*PVOR(IR,JI-1,KMLOC) + PV(IR,JI,KMLOC) = -& + &(JN-1)*PEPSNM(KMLOC,JN)*F_RLAPIN(JN-1)*PDIV(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F_RLAPIN(JN+1)*PDIV(IR,JI-1,KMLOC) + ENDIF + ENDDO + ENDDO +ENDDO + +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif +#ifdef ACCGPU +!$ACC END DATA +#endif +! ------------------------------------------------------------------ + +END SUBROUTINE VDTUV +END MODULE VDTUV_MOD + diff --git a/src/trans/gpu/internal/write_legpol_mod.F90 b/src/trans/gpu/internal/write_legpol_mod.F90 new file mode 100755 index 00000000..a3cf5d7e --- /dev/null +++ b/src/trans/gpu/internal/write_legpol_mod.F90 @@ -0,0 +1,187 @@ +! (C) Copyright 2015- ECMWF. +! (C) Copyright 2015- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE WRITE_LEGPOL_MOD +CONTAINS +SUBROUTINE WRITE_LEGPOL +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +USE TPM_GEN +USE TPM_DISTR +USE TPM_DIM +USE TPM_GEOMETRY +USE TPM_FLT +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE TPM_CTL +USE BYTES_IO_MOD + +!**** *WRITE_LEGPOL * - write out Leg.Pol. and assocciated arrays to file + +! Purpose. +! -------- +! + +!** Interface. +! ---------- +! *CALL* *WRITE_LEGPOL* + +! Explicit arguments : None +! -------------------- + +! Implicit arguments : +! -------------------- +! + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS +! + +! ------- +! Mats Hamrud and Willem Deconinck *ECMWF* + +! Modifications. +! -------------- +! Original : July 2015 + +IMPLICIT NONE + +INTEGER(KIND=JPIM),PARAMETER :: JPIBUFL=4 +INTEGER(KIND=JPIM) :: IRBYTES,IIBYTES,JMLOC,IPRTRV,IMLOC,IM,ILA,ILS,IFILE,JSETV +INTEGER(KIND=JPIM) :: IDGLU,ISIZE,IBYTES,IRET,IBUF(JPIBUFL),IDUM,JGL,II +INTEGER(KIND=JPIM) :: IDGLU2 +REAL(KIND=JPRBT) ,ALLOCATABLE :: ZBUF(:) +INTEGER(KIND=JPIM) ,ALLOCATABLE :: IBUFA(:) +! ------------------------------------------------------------------ + +IRBYTES = 8 +IIBYTES = 4 +IDUM = 3141 + +IF(C%CIO_TYPE == 'file') THEN + CALL BYTES_IO_OPEN(IFILE,C%CLEGPOLFNAME,'W',IRET) + IF ( IRET < JPBYTES_IO_SUCCESS ) CALL ABORT_TRANS('WRITE_LEGPOL: BYTES_IO_OPEN FAILED') +ENDIF +IBUF(1:2) = TRANSFER('LEGPOL ',IBUF(1:2)) +IBUF(3) = R%NSMAX +IBUF(4) = R%NDGNH +CALL BYTES_IO_WRITE(IFILE,IBUF,JPIBUFL*IIBYTES,IRET) +IF ( IRET < JPBYTES_IO_SUCCESS ) CALL ABORT_TRANS('WRITE_LEGPOL: BYTES_IO_WRITE FAILED') +ALLOCATE(IBUFA(2*R%NDGNH)) +II = 0 +DO JGL=1,R%NDGNH + II = II+1 + IBUFA(II) = G%NLOEN(JGL) + II=II+1 + IBUFA(II) = G%NMEN(JGL) +ENDDO +CALL BYTES_IO_WRITE(IFILE,IBUFA,2*R%NDGNH*IIBYTES,IRET) +IF ( IRET < JPBYTES_IO_SUCCESS ) CALL ABORT_TRANS('WRITE_LEGPOL: BYTES_IO_WRITE FAILED') +DEALLOCATE(IBUFA) +DO JMLOC=1,D%NUMP,NPRTRV ! +++++++++++++++++++++ JMLOC LOOP ++++++++++ + IPRTRV=MIN(NPRTRV,D%NUMP-JMLOC+1) + DO JSETV=1,IPRTRV + IMLOC=JMLOC+JSETV-1 + IM = D%MYMS(IMLOC) + ILA = (R%NSMAX-IM+2)/2 + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) +! Anti-symmetric + ISIZE = IDGLU*ILA + IBYTES = ISIZE*IRBYTES + ALLOCATE(ZBUF(ISIZE)) + ZBUF(:) = RESHAPE(S%FA(IMLOC)%RPNMA,(/ISIZE/)) + CALL BYTES_IO_WRITE(IFILE,ZBUF,IBYTES,IRET) + IF( IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(0,*) 'BYTES_IO_WRITE ',IFILE,' ',IBYTES,' FAILED',IRET + CALL ABORT_TRANS('WRITE_LEGPOL:BYTES_IO_WRITE FAILED') + ENDIF + DEALLOCATE(ZBUF) +! Symmetric + ISIZE = IDGLU*ILS + IBYTES = ISIZE*IRBYTES + ALLOCATE(ZBUF(ISIZE)) + ZBUF(:) = RESHAPE(S%FA(IMLOC)%RPNMS,(/ISIZE/)) + CALL BYTES_IO_WRITE(IFILE,ZBUF,IBYTES,IRET) + IF( IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(0,*) 'BYTES_IO_WRITE ',IFILE,' ',IBYTES,' FAILED',IRET + CALL ABORT_TRANS('WRITE_LEGPOL:BYTES_IO_WRITE FAILED') + ENDIF + DEALLOCATE(ZBUF) + ENDDO +ENDDO + +! Lat-lon grid + +IF(S%LDLL) THEN + IBUF(:) = TRANSFER('LATLON---BEG-BEG',IBUF(1:4)) + CALL BYTES_IO_WRITE(IFILE,IBUF,JPIBUFL*IIBYTES,IRET) + IF( IRET < JPBYTES_IO_SUCCESS ) THEN + CALL ABORT_TRANS('WRITE_LEGPOL:BYTES_IO_WRITE FAILED') + ENDIF + DO JMLOC=1,D%NUMP + IM = D%MYMS(JMLOC) + ILA = (R%NSMAX-IM+2)/2 + ILS = (R%NSMAX-IM+3)/2 + IDGLU = MIN(R%NDGNH,G%NDGLU(IM)) + IDGLU2 = S%NDGNHD + IBUF(:) = (/IM,IDGLU,IDGLU2,IDUM/) + CALL BYTES_IO_WRITE(IFILE,IBUF,JPIBUFL*IIBYTES,IRET) + IF( IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(0,*) 'BYTES_IO_WRITE ',IFILE,' ',IIBYTES,' FAILED',IRET + CALL ABORT_TRANS('WRITE_LEGPOL:BYTES_IO_WRITE FAILED') + ENDIF + + ISIZE = 2*IDGLU*2 + IBYTES = ISIZE*IRBYTES + ALLOCATE(ZBUF(ISIZE)) + ZBUF(:) = RESHAPE(S%FA(JMLOC)%RPNMWI,(/ISIZE/)) + CALL BYTES_IO_WRITE(IFILE,ZBUF,IBYTES,IRET) + IF( IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(0,*) 'BYTES_IO_WRITE ',IFILE,' ',IBYTES,' FAILED',IRET + CALL ABORT_TRANS('WRITE_LEGPOL:BYTES_IO_WRITE FAILED') + ENDIF + DEALLOCATE(ZBUF) + + ISIZE = 2*IDGLU2*2 + IBYTES = ISIZE*IRBYTES + ALLOCATE(ZBUF(ISIZE)) + ZBUF(:) = RESHAPE(S%FA(JMLOC)%RPNMWO,(/ISIZE/)) + CALL BYTES_IO_WRITE(IFILE,ZBUF,IBYTES,IRET) + IF( IRET < JPBYTES_IO_SUCCESS ) THEN + WRITE(0,*) 'BYTES_IO_WRITE ',IFILE,' ',IBYTES,' FAILED',IRET + CALL ABORT_TRANS('WRITE_LEGPOL:BYTES_IO_WRITE FAILED') + ENDIF + DEALLOCATE(ZBUF) + + ENDDO +ENDIF +!End marker +IBUF(:) = TRANSFER('LEGPOL---EOF-EOF',IBUF(1:4)) +CALL BYTES_IO_WRITE(IFILE,IBUF,JPIBUFL*IIBYTES,IRET) +IF( IRET < JPBYTES_IO_SUCCESS ) THEN + CALL ABORT_TRANS('WRITE_LEGPOL:BYTES_IO_WRITE FAILED') +ENDIF + +IF(C%CIO_TYPE == 'file') THEN + CALL BYTES_IO_CLOSE(IFILE,IRET) + IF( IRET < JPBYTES_IO_SUCCESS ) THEN + CALL ABORT_TRANS('WRITE_LEGPOL:BYTES_IO_CLOSE FAILED') + ENDIF +ENDIF + +END SUBROUTINE WRITE_LEGPOL +END MODULE WRITE_LEGPOL_MOD diff --git a/src/trans/gpu/sharedmem/sharedmem.c b/src/trans/gpu/sharedmem/sharedmem.c new file mode 100644 index 00000000..29426cea --- /dev/null +++ b/src/trans/gpu/sharedmem/sharedmem.c @@ -0,0 +1,28 @@ +/* + * (C) Copyright 2015- ECMWF. + * + * This software is licensed under the terms of the Apache Licence Version 2.0 + * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. + * In applying this licence, ECMWF does not waive the privileges and immunities + * granted to it by virtue of its status as an intergovernmental organisation + * nor does it submit to any jurisdiction. + */ + + +#include + +void sharedmem_malloc_bytes (void** ptr, size_t bytes) +{ + *ptr = malloc(bytes); +} + +void sharedmem_free(void** ptr) +{ + free(*ptr); +} + +void sharedmem_advance_bytes (void** ptr, size_t bytes) +{ + char** char_ptr = (char**)ptr; + *char_ptr += bytes; +} diff --git a/src/trans/gpu/sharedmem/sharedmem_mod.F90 b/src/trans/gpu/sharedmem/sharedmem_mod.F90 new file mode 100644 index 00000000..bb28a489 --- /dev/null +++ b/src/trans/gpu/sharedmem/sharedmem_mod.F90 @@ -0,0 +1,314 @@ +! (C) Copyright 2015- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE SHAREDMEM_MOD + +! Routines to allow use of shared memery segments in Fortran + + +! Willem Deconinck and Mats Hamrud *ECMWF* +! Original : July 2015 + + +USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT, C_NULL_PTR,C_SIZE_T + +#ifdef __NEC__ +#define C_SIZEOF(x) INT(KIND(x),C_SIZE_T) +#endif + +IMPLICIT NONE + +PRIVATE + +PUBLIC :: SHAREDMEM +PUBLIC :: SHAREDMEM_ALLOCATE +PUBLIC :: SHAREDMEM_MALLOC_BYTES +PUBLIC :: SHAREDMEM_CREATE +PUBLIC :: SHAREDMEM_ASSOCIATE +PUBLIC :: SHAREDMEM_ADVANCE +PUBLIC :: SHAREDMEM_DELETE + +TYPE, BIND(C) :: SHAREDMEM +! Memory buffer + TYPE(C_PTR), PRIVATE :: BEGIN=C_NULL_PTR + INTEGER(C_SIZE_T), PRIVATE :: SIZE=0 ! IN BYTES + TYPE(C_PTR), PRIVATE :: CPTR=C_NULL_PTR + INTEGER(C_SIZE_T), PRIVATE :: OFFSET=0 ! IN BYTES +END TYPE SHAREDMEM + + +INTERFACE SHAREDMEM_ASSOCIATE +! Associate fortran scalars/arrays with memory segment + MODULE PROCEDURE SHAREDMEM_ASSOCIATE0_INT32 + MODULE PROCEDURE SHAREDMEM_ASSOCIATE0_REAL32 + MODULE PROCEDURE SHAREDMEM_ASSOCIATE0_REAL64 + MODULE PROCEDURE SHAREDMEM_ASSOCIATE1_INT32 + MODULE PROCEDURE SHAREDMEM_ASSOCIATE1_REAL32 + MODULE PROCEDURE SHAREDMEM_ASSOCIATE1_REAL64 + MODULE PROCEDURE SHAREDMEM_ASSOCIATE2_INT32 + MODULE PROCEDURE SHAREDMEM_ASSOCIATE2_REAL32 + MODULE PROCEDURE SHAREDMEM_ASSOCIATE2_REAL64 +END INTERFACE + + +INTERFACE + +! EXTERNAL C FUNCTIONS USED IN THIS MODULE +! ---------------------------------------- + + SUBROUTINE SHAREDMEM_ADVANCE_BYTES(CPTR,BYTES) BIND(C) + USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_SIZE_T + TYPE(C_PTR) :: CPTR + INTEGER(C_SIZE_T), VALUE :: BYTES + END SUBROUTINE SHAREDMEM_ADVANCE_BYTES + + SUBROUTINE SHAREDMEM_MALLOC_BYTES(PTR,BYTES) BIND(C) + USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_SIZE_T + TYPE(C_PTR) :: PTR + INTEGER(C_SIZE_T), VALUE :: BYTES + END SUBROUTINE SHAREDMEM_MALLOC_BYTES + + SUBROUTINE SHAREDMEM_FREE(PTR) BIND(C) + USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR + TYPE(C_PTR), INTENT(IN) :: PTR + END SUBROUTINE SHAREDMEM_FREE + +END INTERFACE + +CONTAINS +!========================================================================= +SUBROUTINE SHAREDMEM_CREATE(HANDLE,CPTR,BYTES) +! Create memory buffer object from c pointer +USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_SIZE_T, C_F_POINTER +TYPE(SHAREDMEM), INTENT(OUT) :: HANDLE +TYPE(C_PTR) , INTENT(IN) :: CPTR +INTEGER(C_SIZE_T), INTENT(IN) :: BYTES +!------------------------------------------------------------------------ +HANDLE%BEGIN = CPTR +HANDLE%SIZE = BYTES +HANDLE%CPTR = HANDLE%BEGIN +HANDLE%OFFSET = 0 +END SUBROUTINE SHAREDMEM_CREATE +!========================================================================= +SUBROUTINE SHAREDMEM_ALLOCATE(HANDLE,BYTES) +! Create memory buffer object from Fortran +USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_SIZE_T +TYPE(SHAREDMEM), INTENT(OUT) :: HANDLE +INTEGER(C_SIZE_T), INTENT(IN) :: BYTES +INTEGER(C_SIZE_T) :: SIZE +!------------------------------------------------------------------------ +SIZE = BYTES +CALL SHAREDMEM_MALLOC_BYTES(HANDLE%BEGIN,SIZE) +HANDLE%SIZE = BYTES +HANDLE%CPTR = HANDLE%BEGIN +HANDLE%OFFSET = 0 +END SUBROUTINE SHAREDMEM_ALLOCATE +!========================================================================= +SUBROUTINE SHAREDMEM_DELETE(HANDLE) +! Free memory buffer +TYPE(SHAREDMEM), INTENT(OUT) :: HANDLE +CALL SHAREDMEM_FREE(HANDLE%BEGIN) +END SUBROUTINE SHAREDMEM_DELETE +!========================================================================= + +! PRIVATE SUBROUTINES +! ------------------- + +SUBROUTINE SHAREDMEM_ASSOCIATE0_INT32(HANDLE,VALUE,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + INTEGER(C_INT), INTENT(OUT) :: VALUE + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + INTEGER(C_INT), POINTER :: FPTR(:) + INTEGER(C_INT) :: K + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/1/) ) + VALUE = FPTR(1) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,C_SIZEOF(K)) + HANDLE%OFFSET = HANDLE%OFFSET+C_SIZEOF(K) + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE0_INT32 + +SUBROUTINE SHAREDMEM_ASSOCIATE0_REAL32(HANDLE,VALUE,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + REAL(C_FLOAT), INTENT(OUT) :: VALUE + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + REAL(C_FLOAT), POINTER :: FPTR(:) + REAL(C_FLOAT) :: R + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/1/) ) + VALUE = FPTR(1) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,C_SIZEOF(R)) + HANDLE%OFFSET = HANDLE%OFFSET+C_SIZEOF(R) + + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE0_REAL32 + +SUBROUTINE SHAREDMEM_ASSOCIATE0_REAL64(HANDLE,VALUE,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + REAL(C_DOUBLE), INTENT(OUT) :: VALUE + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + REAL(C_DOUBLE), POINTER :: FPTR(:) + REAL(C_DOUBLE) :: R + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/1/) ) + VALUE = FPTR(1) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,C_SIZEOF(R)) + HANDLE%OFFSET = HANDLE%OFFSET+C_SIZEOF(R) + + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE0_REAL64 + +SUBROUTINE SHAREDMEM_ASSOCIATE1_INT32(HANDLE,SIZE,FPTR,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + INTEGER(C_INT), INTENT(IN) :: SIZE + INTEGER(KIND=C_INT), POINTER, INTENT(INOUT) :: FPTR(:) + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + INTEGER(C_INT) :: K + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/SIZE/) ) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,SIZE*C_SIZEOF(K)) + HANDLE%OFFSET = HANDLE%OFFSET+SIZE*C_SIZEOF(K) + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE1_INT32 + + +SUBROUTINE SHAREDMEM_ASSOCIATE1_REAL32(HANDLE,SIZE,FPTR,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + INTEGER(C_INT), INTENT(IN) :: SIZE + REAL(C_FLOAT), POINTER, INTENT(INOUT) :: FPTR(:) + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + REAL(C_FLOAT) :: R + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/SIZE/) ) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,SIZE*C_SIZEOF(R)) + HANDLE%OFFSET = HANDLE%OFFSET+SIZE*C_SIZEOF(R) + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE1_REAL32 + + +SUBROUTINE SHAREDMEM_ASSOCIATE1_REAL64(HANDLE,SIZE,FPTR,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + INTEGER(C_INT), INTENT(IN) :: SIZE + REAL(C_DOUBLE), POINTER, INTENT(INOUT) :: FPTR(:) + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + REAL(C_DOUBLE) :: R + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/SIZE/) ) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,SIZE*C_SIZEOF(R)) + HANDLE%OFFSET = HANDLE%OFFSET+SIZE*C_SIZEOF(R) + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE1_REAL64 + +SUBROUTINE SHAREDMEM_ASSOCIATE2_INT32(HANDLE,DIM1,DIM2,FPTR,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + INTEGER(C_INT), INTENT(IN) :: DIM1,DIM2 + INTEGER(C_INT), POINTER, INTENT(INOUT) :: FPTR(:,:) + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + INTEGER(C_INT) :: K + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/DIM1,DIM2/) ) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,DIM1*DIM2*C_SIZEOF(K)) + HANDLE%OFFSET = HANDLE%OFFSET+DIM1*DIM2*C_SIZEOF(K) + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE2_INT32 + + +SUBROUTINE SHAREDMEM_ASSOCIATE2_REAL32(HANDLE,DIM1,DIM2,FPTR,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + INTEGER(C_INT), INTENT(IN) :: DIM1,DIM2 + REAL(C_FLOAT), POINTER, INTENT(INOUT) :: FPTR(:,:) + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + REAL(C_FLOAT) :: R + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/DIM1,DIM2/) ) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,DIM1*DIM2*C_SIZEOF(R)) + HANDLE%OFFSET = HANDLE%OFFSET+DIM1*DIM2*C_SIZEOF(R) + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE2_REAL32 + + +SUBROUTINE SHAREDMEM_ASSOCIATE2_REAL64(HANDLE,DIM1,DIM2,FPTR,ADVANCE) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + INTEGER(C_INT), INTENT(IN) :: DIM1,DIM2 + REAL(C_DOUBLE), POINTER, INTENT(INOUT) :: FPTR(:,:) + LOGICAL, OPTIONAL, INTENT(IN) :: ADVANCE + REAL(C_DOUBLE) :: R + + CALL C_F_POINTER ( HANDLE%CPTR , FPTR , (/DIM1,DIM2/) ) + + IF( PRESENT(ADVANCE) ) THEN + IF( ADVANCE ) THEN + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,DIM1*DIM2*C_SIZEOF(R)) + HANDLE%OFFSET = HANDLE%OFFSET+DIM1*DIM2*C_SIZEOF(R) + ENDIF + ENDIF + +END SUBROUTINE SHAREDMEM_ASSOCIATE2_REAL64 + +SUBROUTINE SHAREDMEM_ADVANCE(HANDLE,BYTES) + USE, INTRINSIC :: ISO_C_BINDING + TYPE(SHAREDMEM), INTENT(INOUT) :: HANDLE + INTEGER(C_INT), INTENT(IN) :: BYTES + INTEGER(C_SIZE_T) :: SIZE + SIZE = BYTES + CALL SHAREDMEM_ADVANCE_BYTES(HANDLE%CPTR,SIZE) + HANDLE%OFFSET = HANDLE%OFFSET+BYTES +END SUBROUTINE SHAREDMEM_ADVANCE + +!============================================================================ +END MODULE SHAREDMEM_MOD diff --git a/src/trans/include/ectrans/vordiv_to_uv.h b/src/trans/include/ectrans/vordiv_to_uv.h index 904829d6..fe7e8f6e 100644 --- a/src/trans/include/ectrans/vordiv_to_uv.h +++ b/src/trans/include/ectrans/vordiv_to_uv.h @@ -65,4 +65,4 @@ INTEGER(KIND=JPIM) , INTENT(IN) :: KSMAX INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) END SUBROUTINE VORDIV_TO_UV -END INTERFACE \ No newline at end of file +END INTERFACE diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7b86f807..bae43508 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -36,84 +36,85 @@ if( HAVE_TESTS ) add_test( NAME ectrans_test_install COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test-install.sh ${_test_args} ) -endif() - -if( HAVE_DOUBLE_PRECISION ) - set( trans trans_dp ) - set( parkind parkind_dp ) -else() - set( trans trans_sp ) - set( parkind parkind_sp ) -endif() + if( HAVE_DOUBLE_PRECISION ) + set( trans trans_dp ) + set( parkind parkind_dp ) + else() + set( trans trans_sp ) + set( parkind parkind_sp ) + endif() -ecbuild_add_test(TARGET ectrans_test_adjoint - SOURCES trans/test_adjoint.F90 - LIBS ${trans} ${parkind} - LINKER_LANGUAGE Fortran -) -if( TEST ectrans_test_adjoint AND HAVE_OMP ) - target_link_libraries( ectrans_test_adjoint OpenMP::OpenMP_Fortran ) -endif() + ecbuild_add_test(TARGET ectrans_test_adjoint + SOURCES trans/test_adjoint.F90 + LIBS ${trans} ${parkind} + LINKER_LANGUAGE Fortran + ) + if( TEST ectrans_test_adjoint AND HAVE_OMP ) + target_link_libraries( ectrans_test_adjoint OpenMP::OpenMP_Fortran ) + endif() -set( ntasks 0 ) -set( nthreads 1 ) -if( HAVE_MPI ) - list( APPEND ntasks 1 2 ) -endif() -if( HAVE_OMP ) - list( APPEND nthreads 4 8 ) -endif() + set( ntasks 0 ) + set( nthreads 1 ) + if( HAVE_MPI ) + list( APPEND ntasks 1 2 ) + endif() + if( HAVE_OMP ) + list( APPEND nthreads 4 8 ) + endif() -foreach( prec dp sp ) - foreach( mpi ${ntasks} ) - foreach( omp ${nthreads} ) - set( t 47 ) - set( grid O48 ) - ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld0 - COMMAND ectrans-benchmark-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 0 --meminfo --check 100 --norms -v - MPI ${mpi} - OMP ${omp} - ) - ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10 - COMMAND ectrans-benchmark-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --meminfo --check 100 --norms -v - MPI ${mpi} - OMP ${omp} - ) - ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20 - COMMAND ectrans-benchmark-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --check 100 --norms -v - MPI ${mpi} - OMP ${omp} - ) - ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_scders - COMMAND ectrans-benchmark-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --scders --check 100 --norms -v - MPI ${mpi} - OMP ${omp} - ) - ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_vordiv - COMMAND ectrans-benchmark-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --vordiv --check 100 --norms -v - MPI ${mpi} - OMP ${omp} - ) - ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_vordiv_uvders - COMMAND ectrans-benchmark-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --vordiv --uvders --check 100 --norms -v - MPI ${mpi} - OMP ${omp} - ) - ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_flt - COMMAND ectrans-benchmark-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --flt --check 1000 --norms -v - MPI ${mpi} - OMP ${omp} - ) - ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_nproma16 - COMMAND ectrans-benchmark-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --nproma 16 --check 100 --norms -v - MPI ${mpi} - OMP ${omp} - ) - endforeach() + foreach( prec dp sp ) + if( TARGET ectrans-benchmark-cpu-${prec} ) + foreach( mpi ${ntasks} ) + foreach( omp ${nthreads} ) + set( t 47 ) + set( grid O48 ) + ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld0 + COMMAND ectrans-benchmark-cpu-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 0 --meminfo --check 100 --norms -v + MPI ${mpi} + OMP ${omp} + ) + ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10 + COMMAND ectrans-benchmark-cpu-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --meminfo --check 100 --norms -v + MPI ${mpi} + OMP ${omp} + ) + ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20 + COMMAND ectrans-benchmark-cpu-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --check 100 --norms -v + MPI ${mpi} + OMP ${omp} + ) + ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_scders + COMMAND ectrans-benchmark-cpu-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --scders --check 100 --norms -v + MPI ${mpi} + OMP ${omp} + ) + ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_vordiv + COMMAND ectrans-benchmark-cpu-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --vordiv --check 100 --norms -v + MPI ${mpi} + OMP ${omp} + ) + ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_vordiv_uvders + COMMAND ectrans-benchmark-cpu-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --vordiv --uvders --check 100 --norms -v + MPI ${mpi} + OMP ${omp} + ) + ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_flt + COMMAND ectrans-benchmark-cpu-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --flt --check 2000 --norms -v + MPI ${mpi} + OMP ${omp} + ) + ecbuild_add_test( TARGET ectrans_test_benchmark_${prec}_T${t}_${grid}_mpi${mpi}_omp${omp}_nfld10_nlev20_nproma16 + COMMAND ectrans-benchmark-cpu-${prec} ARGS --truncation ${t} --grid ${grid} --niter 2 --nfld 10 --nlev 20 --nproma 16 --check 100 --norms -v + MPI ${mpi} + OMP ${omp} + ) + endforeach() + endforeach() + endif() endforeach() -endforeach() +endif() if( HAVE_TRANSI ) From d17f2b4c3b7db9c1916b31842d60e911ff4d21f3 Mon Sep 17 00:00:00 2001 From: David Davies Date: Fri, 19 Jul 2024 15:40:23 +0100 Subject: [PATCH 47/48] Remove CONTIGUOUS from dist_spec_control_argument (#98) Co-authored-by: Willem Deconinck --- cmake/ectrans_compile_options.cmake | 13 +++++++++++++ src/trans/cpu/CMakeLists.txt | 7 +++++++ src/trans/cpu/internal/dist_spec_control_mod.F90 | 10 +++++++++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/cmake/ectrans_compile_options.cmake b/cmake/ectrans_compile_options.cmake index 6815d6da..37b4171e 100644 --- a/cmake/ectrans_compile_options.cmake +++ b/cmake/ectrans_compile_options.cmake @@ -30,6 +30,19 @@ elseif( CMAKE_Fortran_COMPILER_ID MATCHES "Intel" ) ecbuild_add_fortran_flags("-fast-transcendentals -fp-model precise -fp-speculation=safe") endif() +if( NOT DEFINED ECTRANS_HAVE_CONTIGUOUS_ISSUE ) + if( CMAKE_Fortran_COMPILER_ID MATCHES "Intel" ) + if( CMAKE_Fortran_COMPILER_VERSION VERSION_LESS_EQUAL 19) + set( ECTRANS_HAVE_CONTIGUOUS_ISSUE True ) + endif() + elseif( CMAKE_Fortran_COMPILER_ID MATCHES "GNU" ) + if( CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL "9.2" + OR CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL "12.2.0" ) + set( ECTRANS_HAVE_CONTIGUOUS_ISSUE True ) + endif() + endif() +endif() + macro( ectrans_add_compile_options ) set( options ) set( single_value_args FLAGS ) diff --git a/src/trans/cpu/CMakeLists.txt b/src/trans/cpu/CMakeLists.txt index 01f5bcb8..d0d13722 100644 --- a/src/trans/cpu/CMakeLists.txt +++ b/src/trans/cpu/CMakeLists.txt @@ -253,6 +253,13 @@ foreach( prec dp sp ) target_link_libraries( ectrans_${prec} PRIVATE OpenMP::OpenMP_Fortran ) endif() + if( ECTRANS_HAVE_CONTIGUOUS_ISSUE ) + # See https://github.com/ecmwf-ifs/ectrans/pull/98 + # There is a problem with CONTIGUOUS keyword in dist_spec_control_mod.F90 + ecbuild_debug("target_compile_definitions( ectrans_${prec} PRIVATE CONTIG_BUGGY_COMPILER)") + target_compile_definitions( ectrans_${prec} PRIVATE CONTIG_BUGGY_COMPILER) + endif() + # This interface library is for backward compatibility, and provides the older includes ecbuild_add_library( TARGET trans_${prec} TYPE INTERFACE ) target_include_directories( trans_${prec} INTERFACE $ ) diff --git a/src/trans/cpu/internal/dist_spec_control_mod.F90 b/src/trans/cpu/internal/dist_spec_control_mod.F90 index a57761f2..2b6b7b8e 100644 --- a/src/trans/cpu/internal/dist_spec_control_mod.F90 +++ b/src/trans/cpu/internal/dist_spec_control_mod.F90 @@ -67,7 +67,15 @@ SUBROUTINE DIST_SPEC_CONTROL(PSPECG,KFDISTG,KFROM,KVSET,PSPEC,LDIM1_IS_FLD,& IMPLICIT NONE -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN), CONTIGUOUS :: PSPECG(:,:) +! See https://github.com/ecmwf-ifs/ectrans/pull/98 +! There is a problem with CONTIGUOUS keyword +#ifndef CONTIG_BUGGY_COMPILER +#define CONTIG_STATUS ,CONTIGUOUS +#else +#define CONTIG_STATUS +#endif + +REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) CONTIG_STATUS :: PSPECG(:,:) INTEGER(KIND=JPIM) , INTENT(IN) :: KFDISTG INTEGER(KIND=JPIM) , INTENT(IN) :: KFROM(:) INTEGER(KIND=JPIM) , INTENT(IN) :: KVSET(:) From 868b71f7f5a005a070698b0d7a806cad1c07396e Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 23 Jul 2024 15:38:32 +0000 Subject: [PATCH 48/48] Version 1.4.0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 93b33ad5..88c5fb89 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.4.0-prerelease +1.4.0