forked from CICE-Consortium/CICE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathice_reprosum.F90
1477 lines (1262 loc) · 60.9 KB
/
ice_reprosum.F90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
!> Reproducible sum method from P. Worley
MODULE ice_reprosum
!-----------------------------------------------------------------------
!
! Purpose:
!> Compute reproducible global sums of a set of arrays across an MPI
!> subcommunicator
!
! Methods:
!> Compute using either or both a scalable, reproducible algorithm and a
!> scalable, nonreproducible algorithm:
!> * Reproducible (scalable):
!> Convert to fixed point (integer vector representation) to enable
!> reproducibility when using MPI_Allreduce
!> * Alternative usually reproducible (scalable):
!> Use parallel double-double algorithm due to Helen He and
!> Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm
!> * Nonreproducible (scalable):
!> Floating point and MPI_Allreduce based.
!> If computing both reproducible and nonreproducible sums, compare
!> these and report relative difference (if absolute difference
!> less than sum) or absolute difference back to calling routine.
!
! Author: P. Worley (based on suggestions from J. White for fixed
! point algorithm and on He/Ding paper for ddpdd
! algorithm)
!
! Modified by T.Craig for CICE, March 2019 based on the public version in
! Oasis3-MCT_4.0.
!
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
!- use statements ------------------------------------------------------
!-----------------------------------------------------------------------
#ifndef SERIAL_REMOVE_MPI
use mpi ! MPI Fortran module
#endif
#if ( defined noI8 )
! Workaround for when shr_kind_i8 is not supported.
use ice_kinds_mod, only: r8 => dbl_kind, i8 => int_kind
#else
use ice_kinds_mod, only: r8 => dbl_kind, i8 => int8_kind
#endif
use ice_kinds_mod, only: char_len_long
use ice_fileunits, only: nu_diag
use ice_exit, only: abort_ice
! internal timers not yet implemented, need to revisit if needed
! use ice_mpi, only: xicex_mpi_barrier
! use ice_timer, only: xicex_timer_start, xicex_timer_stop
!-----------------------------------------------------------------------
!- module boilerplate --------------------------------------------------
!-----------------------------------------------------------------------
implicit none
private
!-----------------------------------------------------------------------
! Public interfaces ----------------------------------------------------
!-----------------------------------------------------------------------
public :: &
ice_reprosum_setopts, &! set runtime options
ice_reprosum_calc, &! calculate distributed sum
ice_reprosum_tolExceeded ! utility function to check relative
! differences against the tolerance
!-----------------------------------------------------------------------
! Public data ----------------------------------------------------------
!-----------------------------------------------------------------------
logical, public :: ice_reprosum_recompute = .false.
real(r8), public :: ice_reprosum_reldiffmax = -1.0_r8
!-----------------------------------------------------------------------
! Private interfaces ---------------------------------------------------
!-----------------------------------------------------------------------
private :: &
ddpdd, &! double-double sum routine
split_indices ! split indices among OMP threads
!-----------------------------------------------------------------------
! Private data ---------------------------------------------------------
!-----------------------------------------------------------------------
logical :: repro_sum_use_ddpdd = .false.
logical :: detailed_timing = .false.
character(len=char_len_long) :: tmpstr
CONTAINS
!========================================================================
!-----------------------------------------------------------------------
! Purpose:
!> Set runtime options
! Author: P. Worley
!-----------------------------------------------------------------------
subroutine ice_reprosum_setopts(repro_sum_use_ddpdd_in, &
repro_sum_rel_diff_max_in, &
repro_sum_recompute_in, &
repro_sum_master, &
repro_sum_logunit )
! Set the module's runtime options. May only be called once per run;
! a second call aborts. Absent optional arguments leave the
! corresponding module defaults unchanged.
!------------------------------Arguments--------------------------------
logical, intent(in), optional :: repro_sum_use_ddpdd_in
!< Use DDPDD algorithm instead of fixed precision algorithm
real(r8), intent(in), optional :: repro_sum_rel_diff_max_in
!< maximum permissible difference between reproducible and
!< nonreproducible sums
logical, intent(in), optional :: repro_sum_recompute_in
!< recompute using different algorithm when difference between
!< reproducible and nonreproducible sums is too great
logical, intent(in), optional :: repro_sum_master
!< flag indicating whether this process should output
!< log messages
integer, intent(in), optional :: repro_sum_logunit
!< unit number for log messages
!---------------------------Local Workspace-----------------------------
integer llogunit ! unit number for log messages
logical master ! local master?
logical,save :: firstcall = .true. ! first call
character(len=*),parameter :: subname = '(ice_reprosum_setopts)'
!-----------------------------------------------------------------------
! only the designated master process writes log messages
if ( present(repro_sum_master) ) then
master = repro_sum_master
else
master = .false.
endif
if ( present(repro_sum_logunit) ) then
llogunit = repro_sum_logunit
else
llogunit = nu_diag
endif
! enforce single-call semantics: options must not change mid-run
if (.not. firstcall) then
write(tmpstr,*) subname//' ERROR: can only be called once'
call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
endif
firstcall = .false.
if ( present(repro_sum_use_ddpdd_in) ) then
repro_sum_use_ddpdd = repro_sum_use_ddpdd_in
endif
if ( present(repro_sum_rel_diff_max_in) ) then
ice_reprosum_reldiffmax = repro_sum_rel_diff_max_in
endif
if ( present(repro_sum_recompute_in) ) then
ice_reprosum_recompute = repro_sum_recompute_in
endif
if (master) then
if ( repro_sum_use_ddpdd ) then
write(llogunit,*) subname, &
'Using double-double-based (scalable) usually reproducible ', &
'distributed sum algorithm'
else
write(llogunit,*) subname, &
'Using fixed-point-based (scalable) reproducible ', &
'distributed sum algorithm'
endif
if (ice_reprosum_reldiffmax >= 0._r8) then
write(llogunit,*) subname, &
' with a maximum relative error tolerance of ', &
ice_reprosum_reldiffmax
if (ice_reprosum_recompute) then
write(llogunit,*) subname, &
'If tolerance exceeded, sum is recomputed using ', &
'a serial algorithm.'
else
! fixed word order of the log message ("is sum used" -> "sum is used")
write(llogunit,*) subname, &
'If tolerance exceeded, fixed-precision sum is used ', &
'but a warning is output.'
endif
else
write(llogunit,*) subname, &
'and not comparing with floating point algorithms.'
endif
endif
end subroutine ice_reprosum_setopts
!========================================================================
!
! Purpose:
!> Compute the global sum of each field in "arr" using the indicated
!> communicator with a reproducible yet scalable implementation based
!> on a fixed point algorithm. An alternative is to use an "almost
!> always reproducible" floating point algorithm.
!
! The accuracy of the fixed point algorithm is controlled by the
! number of "levels" of integer expansion. The algorithm will calculate
! the number of levels that is required for the sum to be essentially
! exact. The optional parameter arr_max_levels can be used to override
! the calculated value. The optional parameter arr_max_levels_out can be
! used to return the values used.
!
! The algorithm also requires an upper bound on
! the maximum summand (in absolute value) for each field, and will
! calculate this internally. However, if the optional parameters
! arr_max_levels and arr_gbl_max are both set, then the algorithm will
! use the values in arr_gbl_max for the upper bounds instead. If these
! are not upper bounds, or if the upper bounds are not tight enough
! to achieve the requisite accuracy, and if the optional parameter
! repro_sum_validate is NOT set to .false., the algorithm will repeat the
! computation with appropriate upper bounds. If only arr_gbl_max is present,
! then the maxima are computed internally (and the specified values are
! ignored). The optional parameter arr_gbl_max_out can be
! used to return the values used.
!
! Finally, the algorithm requires an upper bound on the number of
! local summands across all processes. This will be calculated internally,
! using an MPI collective, but the value in the optional argument
! gbl_max_nsummands will be used instead if (1) it is present, (2)
! it is > 0, and (3) the maximum value and required number of levels
! are also specified. (If the maximum value is calculated, the same
! MPI collective is used to determine the maximum number of local
! summands.) The accuracy of the user-specified value is not checked.
! However, if set to < 1, the value will instead be calculated. If the
! optional parameter gbl_max_nsummands_out is present, then the value
! used (gbl_max_nsummands if >= 1; calculated otherwise) will be
! returned.
!
! If requested (by setting ice_reprosum_reldiffmax >= 0.0 and passing in
! the optional rel_diff parameter), results are compared with a
! nonreproducible floating point algorithm.
!
! Note that the cost of the algorithm is not strongly correlated with
! the number of levels, which primarily shows up as a (modest) increase
! in cost of the MPI_Allreduce as a function of vector length. Rather the
! cost is more a function of (a) the number of integers required to
! represent an individual summand and (b) the number of MPI_Allreduce
! calls. The number of integers required to represent an individual
! summand is 1 or 2 when using 8-byte integers for 8-byte real summands
! when the number of local summands is not too large. As the number of
! local summands increases, the number of integers required increases.
! The number of MPI_Allreduce calls is either 2 (specifying nothing) or
! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max
! correctly). When specifying arr_max_levels and arr_gbl_max
! incorrectly, 3 or 4 MPI_Allreduce calls will be required.
!
! The alternative algorithm is a minor modification of a parallel
! implementation of David Bailey's routine DDPDD by Helen He
! and Chris Ding. Bailey uses the Knuth trick to implement quadruple
! precision summation of double precision values with 10 double
! precision operations. The advantage of this algorithm is that
! it requires a single MPI_Allreduce and is less expensive per summand
! than is the fixed precision algorithm. The disadvantage is that it
! is not guaranteed to be reproducible (though it is reproducible
! much more often than is the standard algorithm). This alternative
! is used when the optional parameter ddpdd_sum is set to .true. It is
! also used if the fixed precision algorithm radix assumption does not
! hold.
!----------------------------------------------------------------------
subroutine ice_reprosum_calc (arr, arr_gsum, nsummands, dsummands, &
nflds, ddpdd_sum, &
arr_gbl_max, arr_gbl_max_out, &
arr_max_levels, arr_max_levels_out, &
gbl_max_nsummands, gbl_max_nsummands_out,&
gbl_count, repro_sum_validate, &
repro_sum_stats, rel_diff, commid )
! Driver: computes reproducible global sums of each field in arr,
! dispatching to the ddpdd algorithm or the fixed-point algorithm
! (ice_reprosum_int), optionally comparing against a nonreproducible
! floating-point sum and returning statistics/differences.
!----------------------------------------------------------------------
! Arguments
integer, intent(in) :: nsummands !< number of local summands
integer, intent(in) :: dsummands !< declared first dimension
integer, intent(in) :: nflds !< number of fields
real(r8), intent(in) :: arr(dsummands,nflds)
!< input array
real(r8), intent(out):: arr_gsum(nflds)
!< global means
logical, intent(in), optional :: ddpdd_sum
!< use ddpdd algorithm instead
!< of fixed precision algorithm
real(r8), intent(in), optional :: arr_gbl_max(nflds)
!< upper bound on max(abs(arr))
real(r8), intent(out), optional :: arr_gbl_max_out(nflds)
!< calculated upper bound on
!< max(abs(arr))
integer, intent(in), optional :: arr_max_levels(nflds)
!< maximum number of levels of
!< integer expansion to use
integer, intent(out), optional :: arr_max_levels_out(nflds)
!< output of number of levels of
!< integer expansion to used
integer, intent(in), optional :: gbl_max_nsummands
!< maximum of nsummand over all
!< processes
integer, intent(out), optional :: gbl_max_nsummands_out
!< calculated maximum nsummands
!< over all processes
integer, intent(in), optional :: gbl_count
!< was total number of summands;
!< now is ignored; use
!< gbl_max_nsummands instead
logical, intent(in), optional :: repro_sum_validate
!< flag enabling/disabling testing that gmax and max_levels are
!< accurate/sufficient. Default is enabled.
integer, intent(inout), optional :: repro_sum_stats(5)
!< increment running totals for
!< (1) one-reduction repro_sum
!< (2) two-reduction repro_sum
!< (3) both types in one call
!< (4) nonrepro_sum
!< (5) global max nsummands reduction
real(r8), intent(out), optional :: rel_diff(2,nflds)
!< relative and absolute
!< differences between fixed
!< and floating point sums
integer, intent(in), optional :: commid
!< MPI communicator
! Local workspace
logical :: use_ddpdd_sum ! flag indicating whether to
! use ice_reprosum_ddpdd or not
logical :: recompute ! flag indicating need to
! determine gmax/gmin before
! computing sum
logical :: validate ! flag indicating need to
! verify gmax and max_levels
! are accurate/sufficient
integer :: omp_nthreads ! number of OpenMP threads
integer :: mpi_comm ! MPI subcommunicator
integer :: tasks ! number of MPI processes
integer :: mype ! MPI task rank
integer :: ierr ! MPI error return
integer :: ifld, isum, ithread ! loop variables
integer :: max_nsummands ! max nsummands over all processes
! or threads (used in both ways)
integer, allocatable :: isum_beg(:), isum_end(:)
! range of summand indices for each
! OpenMP thread
integer, allocatable :: arr_tlmin_exp(:,:)
! per thread local exponent minima
integer, allocatable :: arr_tlmax_exp(:,:)
! per thread local exponent maxima
integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax
! summand exponent and working min/max
integer :: arr_lmin_exp(nflds) ! local exponent minima
integer :: arr_lmax_exp(nflds) ! local exponent maxima
integer :: arr_lextremes(0:nflds,2)! local exponent extrema
integer :: arr_gextremes(0:nflds,2)! global exponent extrema
integer :: arr_gmax_exp(nflds) ! global exponents maxima
integer :: arr_gmin_exp(nflds) ! global exponents minima
integer :: arr_max_shift ! maximum safe exponent for
! value < 1 (so that sum does
! not overflow)
integer :: max_levels(nflds) ! maximum number of levels of
! integer expansion to use
integer :: max_level ! maximum value in max_levels
integer :: gbl_max_red ! global max local sum reduction? (0/1)
integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1)
integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1)
integer :: repro_sum_both ! both fast and slow? (0/1)
integer :: nonrepro_sum ! nonrepro_sum? (0/1)
real(r8) :: xmax_nsummands ! dble of max_nsummands
real(r8) :: arr_lsum(nflds) ! local sums
real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using
! fast, nonreproducible,
! floating point alg.
real(r8) :: abs_diff ! absolute difference between
! fixed and floating point
! sums
#ifdef _OPENMP
integer omp_get_max_threads
external omp_get_max_threads
#endif
character(len=*),parameter :: subname = '(ice_reprosum_calc)'
!-----------------------------------------------------------------------
! check whether should use ice_reprosum_ddpdd algorithm
use_ddpdd_sum = repro_sum_use_ddpdd
if ( present(ddpdd_sum) ) then
use_ddpdd_sum = ddpdd_sum
endif
! check whether intrinsic-based algorithm will work on this system
! (requires floating point and integer bases to be the same)
! If not, always use ddpdd.
use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8))
! initialize local statistics variables
gbl_max_red = 0
repro_sum_fast = 0
repro_sum_slow = 0
repro_sum_both = 0
nonrepro_sum = 0
! set MPI communicator
if ( present(commid) ) then
mpi_comm = commid
else
#ifdef SERIAL_REMOVE_MPI
mpi_comm = 0
#else
mpi_comm = MPI_COMM_WORLD
#endif
endif
! if (detailed_timing) then
! call xicex_timer_start('xicex_reprosum_prebarrier')
! call xicex_mpi_barrier(mpi_comm,subname)
! call xicex_timer_stop ('xicex_reprosum_prebarrier')
! endif
if ( use_ddpdd_sum ) then
! if (detailed_timing) call xicex_timer_start('ice_reprosum_ddpdd')
call ice_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, &
nflds, mpi_comm)
repro_sum_fast = 1
! if (detailed_timing) call xicex_timer_stop('ice_reprosum_ddpdd')
else
! if (detailed_timing) call xicex_timer_start('ice_reprosum_int')
! get number of MPI tasks
#ifdef SERIAL_REMOVE_MPI
tasks = 1
mype = 0
#else
call mpi_comm_size(mpi_comm, tasks, ierr)
call mpi_comm_rank(mpi_comm, mype, ierr)
#endif
! get number of OpenMP threads
#ifdef _OPENMP
omp_nthreads = omp_get_max_threads()
#else
omp_nthreads = 1
#endif
! see if have sufficient information to not require max/min allreduce
recompute = .true.
validate = .false.
if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then
recompute = .false.
! setting lower bound on max_level*nflds to be 64 to improve OpenMP
! performance for loopb in ice_reprosum_int
max_level = (64/nflds) + 1
do ifld=1,nflds
if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. &
(arr_max_levels(ifld) > 0)) then
arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld))
if (max_level < arr_max_levels(ifld)) &
max_level = arr_max_levels(ifld)
else
! any field with unusable bounds forces the two-reduction path
recompute = .true.
endif
enddo
if (.not. recompute) then
! determine maximum number of summands in local phases of the
! algorithm
! if (detailed_timing) call xicex_timer_start("repro_sum_allr_max")
if ( present(gbl_max_nsummands) ) then
if (gbl_max_nsummands < 1) then
#ifdef SERIAL_REMOVE_MPI
max_nsummands = nsummands
#else
call mpi_allreduce (nsummands, max_nsummands, 1, &
MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
#endif
gbl_max_red = 1
else
max_nsummands = gbl_max_nsummands
endif
else
#ifdef SERIAL_REMOVE_MPI
max_nsummands = nsummands
#else
call mpi_allreduce (nsummands, max_nsummands, 1, &
MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
#endif
gbl_max_red = 1
endif
! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_max")
! determine maximum shift. Shift needs to be small enough that summation
! does not exceed maximum number of digits in i8.
! if requested, return max_nsummands before it is redefined
if ( present( gbl_max_nsummands_out) ) then
gbl_max_nsummands_out = max_nsummands
endif
! summing within each thread first
max_nsummands = (max_nsummands/omp_nthreads) + 1
! then over threads and tasks
max_nsummands = max(max_nsummands, tasks*omp_nthreads)
xmax_nsummands = dble(max_nsummands)
arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
if (arr_max_shift < 2) then
write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
endif
! calculate sum
if (present(repro_sum_validate)) then
validate = repro_sum_validate
else
validate = .true.
endif
call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, &
nflds, arr_max_shift, arr_gmax_exp, &
arr_max_levels, max_level, validate, &
recompute, omp_nthreads, mpi_comm)
! record statistics, etc.
repro_sum_fast = 1
if (recompute) then
! validation failed inside ice_reprosum_int; the slow path below runs too
repro_sum_both = 1
else
! if requested, return specified levels and upper bounds on maxima
if ( present(arr_max_levels_out) ) then
do ifld=1,nflds
arr_max_levels_out(ifld) = arr_max_levels(ifld)
enddo
endif
if ( present(arr_gbl_max_out) ) then
do ifld=1,nflds
arr_gbl_max_out(ifld) = arr_gbl_max(ifld)
enddo
endif
endif
endif
endif
! do not have sufficient information; calculate global max/min and
! use to compute required number of levels
if (recompute) then
! record statistic
repro_sum_slow = 1
! determine maximum and minimum (non-zero) summand values and
! maximum number of local summands
! allocate thread-specific work space
allocate(arr_tlmax_exp(nflds,omp_nthreads))
allocate(arr_tlmin_exp(nflds,omp_nthreads))
allocate(isum_beg(omp_nthreads))
allocate(isum_end(omp_nthreads))
! split summand index range over OpenMP threads
call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
!$omp parallel do &
!$omp default(shared) &
!$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax)
do ithread=1,omp_nthreads
! if (detailed_timing) call xicex_timer_start('repro_sum_loopa')
do ifld=1,nflds
arr_exp_tlmin = MAXEXPONENT(1._r8)
arr_exp_tlmax = MINEXPONENT(1._r8)
do isum=isum_beg(ithread),isum_end(ithread)
! zeros are skipped so the exponent minimum reflects nonzero values only
if (arr(isum,ifld) .ne. 0.0_r8) then
arr_exp = exponent(arr(isum,ifld))
arr_exp_tlmin = min(arr_exp,arr_exp_tlmin)
arr_exp_tlmax = max(arr_exp,arr_exp_tlmax)
endif
end do
arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin
arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax
end do
! if (detailed_timing) call xicex_timer_stop('repro_sum_loopa')
end do
do ifld=1,nflds
arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:))
arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:))
end do
deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end)
! pack negated maxima (and negated nsummands) alongside the minima so
! that a single MPI_MIN allreduce yields all maxima and minima at once
arr_lextremes(0,:) = -nsummands
arr_lextremes(1:nflds,1) = -arr_lmax_exp(:)
arr_lextremes(1:nflds,2) = arr_lmin_exp(:)
! if (detailed_timing) call xicex_timer_start("repro_sum_allr_minmax")
#ifdef SERIAL_REMOVE_MPI
arr_gextremes = arr_lextremes
#else
call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), &
MPI_INTEGER, MPI_MIN, mpi_comm, ierr)
#endif
! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_minmax")
! undo the negation to recover the true maxima
max_nsummands = -arr_gextremes(0,1)
arr_gmax_exp(:) = -arr_gextremes(1:nflds,1)
arr_gmin_exp(:) = arr_gextremes(1:nflds,2)
! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT
! and arr_gmax_exp still equals MINEXPONENT. In this case, set
! arr_gmin_exp = arr_gmax_exp = MINEXPONENT
do ifld=1,nflds
arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld))
enddo
! if requested, return upper bounds on observed maxima
if ( present(arr_gbl_max_out) ) then
do ifld=1,nflds
arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld))
enddo
endif
! if requested, return max_nsummands before it is redefined
if ( present( gbl_max_nsummands_out) ) then
gbl_max_nsummands_out = max_nsummands
endif
! determine maximum shift (same as in previous branch, but with calculated
! max_nsummands). Shift needs to be small enough that summation does not
! exceed maximum number of digits in i8.
! summing within each thread first
max_nsummands = (max_nsummands/omp_nthreads) + 1
! then over threads and tasks
max_nsummands = max(max_nsummands, tasks*omp_nthreads)
xmax_nsummands = dble(max_nsummands)
arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
if (arr_max_shift < 2) then
write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
endif
! determine maximum number of levels required for each field
! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift)
! + 1 because first truncation probably does not involve a maximal shift
! + 1 to guarantee that the integer division rounds up (not down)
! (setting lower bound on max_level*nflds to be 64 to improve OpenMP
! performance for loopb in ice_reprosum_int)
max_level = (64/nflds) + 1
do ifld=1,nflds
max_levels(ifld) = 2 + &
((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) &
/ arr_max_shift)
if ( present(arr_max_levels) .and. (.not. validate) ) then
! if validate true, then computation with arr_max_levels failed
! previously
if ( arr_max_levels(ifld) > 0 ) then
max_levels(ifld) = &
min(arr_max_levels(ifld),max_levels(ifld))
endif
endif
if (max_level < max_levels(ifld)) &
max_level = max_levels(ifld)
enddo
! if requested, return calculated levels
if ( present(arr_max_levels_out) ) then
do ifld=1,nflds
arr_max_levels_out(ifld) = max_levels(ifld)
enddo
endif
! calculate sum
! levels were derived from the observed global extrema, so no
! validation pass is needed on this call
validate = .false.
call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, &
arr_max_shift, arr_gmax_exp, max_levels, &
max_level, validate, recompute, &
omp_nthreads, mpi_comm)
endif
! if (detailed_timing) call xicex_timer_stop('ice_reprosum_int')
endif
! compare fixed and floating point results
if ( present(rel_diff) ) then
if (ice_reprosum_reldiffmax >= 0.0_r8) then
! if (detailed_timing) then
! call xicex_timer_start('xicex_nonreprosum_prebarrier')
! call xicex_mpi_barrier(mpi_comm,subname)
! call xicex_timer_stop ('xicex_nonreprosum_prebarrier')
! endif
! if (detailed_timing) call xicex_timer_start('nonrepro_sum')
! record statistic
nonrepro_sum = 1
! compute nonreproducible sum
arr_lsum(:) = 0._r8
!$omp parallel do &
!$omp default(shared) &
!$omp private(ifld, isum)
do ifld=1,nflds
do isum=1,nsummands
arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld)
end do
end do
#ifdef SERIAL_REMOVE_MPI
arr_gsum_fast = arr_lsum
#else
! NOTE(review): assumes r8 corresponds to MPI_REAL8 — confirm if
! dbl_kind is ever configured as something other than an 8-byte real
call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, &
MPI_REAL8, MPI_SUM, mpi_comm, ierr)
#endif
! if (detailed_timing) call xicex_timer_stop('nonrepro_sum')
! determine differences
!$omp parallel do &
!$omp default(shared) &
!$omp private(ifld, abs_diff)
do ifld=1,nflds
abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld))
! report relative difference when well-defined, else absolute
if (abs(arr_gsum(ifld)) > abs_diff) then
rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld))
else
rel_diff(1,ifld) = abs_diff
endif
rel_diff(2,ifld) = abs_diff
enddo
else
rel_diff(:,:) = 0.0_r8
endif
endif
! return statistics
if ( present(repro_sum_stats) ) then
repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast
repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow
repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both
repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum
repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red
endif
end subroutine ice_reprosum_calc
!========================================================================
!
! Purpose:
!> Compute the global sum of each field in "arr" over the indicated
!> communicator using a reproducible yet scalable implementation based
!> on a fixed-point algorithm. The accuracy of the fixed-point
!> algorithm is controlled by the number of "levels" of integer
!> expansion, whose maximum value is specified by max_level.
!
!----------------------------------------------------------------------
subroutine ice_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, &
arr_max_shift, arr_gmax_exp, max_levels, &
max_level, validate, recompute, &
omp_nthreads, mpi_comm )
!----------------------------------------------------------------------
! Arguments
integer, intent(in) :: nsummands !< number of local summands
integer, intent(in) :: dsummands !< declared first dimension
integer, intent(in) :: nflds !< number of fields
integer, intent(in) :: arr_max_shift !< maximum safe exponent for
!< value < 1 (so that sum
!< does not overflow)
integer, intent(in) :: arr_gmax_exp(nflds)
!< exponents of global maxima
integer, intent(in) :: max_levels(nflds)
!< maximum number of levels
!< of integer expansion
integer, intent(in) :: max_level !< maximum value in
!< max_levels
integer, intent(in) :: omp_nthreads !< number of OpenMP threads
integer, intent(in) :: mpi_comm !< MPI subcommunicator
real(r8), intent(in) :: arr(dsummands,nflds)
!< input array
logical, intent(in):: validate
!< flag indicating that accuracy of solution generated from
!< arr_gmax_exp and max_levels should be tested
logical, intent(out):: recompute
!< flag indicating that either the upper bounds are inaccurate,
!< or max_levels and arr_gmax_exp do not generate accurate
!< enough sums
real(r8), intent(out):: arr_gsum(nflds) !< global means
! Local workspace
integer, parameter :: max_jlevel = &
1 + (digits(0_i8)/digits(0.0_r8))
integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads)
! integer vector representing local
! sum (per thread, per field)
integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds)
! integer vector representing local
! sum
integer(i8) :: i8_arr_level ! integer part of summand for current
! expansion level
integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds)
! integer vector representing global
! sum
integer(i8) :: IX_8 ! integer representation of current
! jlevels of X_8 ('part' of
! i8_arr_gsum_level)
integer(i8) :: i8_sign ! sign global sum
integer(i8) :: i8_radix ! radix for i8 variables
integer :: max_error(nflds,omp_nthreads)
! accurate upper bound on data?
integer :: not_exact(nflds,omp_nthreads)
! max_levels sufficient to
! capture all digits?
integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads)
! range of summand indices for each
! OpenMP thread
integer :: ifld, isum, ithread
! loop variables
integer :: arr_exp ! exponent of summand
integer :: arr_shift ! exponent used to generate integer
! for current expansion level
integer :: ilevel ! current integer expansion level
integer :: offset(nflds) ! beginning location in
! i8_arr_{g,l}sum_level for integer
! expansion of current ifld
integer :: voffset ! modification to offset used to
! include validation metrics
integer :: ioffset ! offset(ifld)
integer :: jlevel ! number of floating point 'pieces'
! extracted from a given i8 integer
integer :: ierr ! MPI error return
integer :: LX(max_jlevel) ! exponent of X_8 (see below)
integer :: veclth ! total length of i8_arr_lsum_level
integer :: sum_digits ! lower bound on number of significant
! in integer expansion of sum
integer :: curr_exp ! exponent of partial sum during
! reconstruction from integer vector
integer :: corr_exp ! exponent of current summand in
! reconstruction from integer vector
real(r8) :: arr_frac ! fraction of summand
real(r8) :: arr_remainder ! part of summand remaining after
! current level of integer expansion
real(r8) :: X_8(max_jlevel) ! r8 vector representation of current
! i8_arr_gsum_level
real(r8) :: RX_8 ! r8 representation of difference
! between current i8_arr_gsum_level
! and current jlevels of X_8
! (== IX_8). Also used in final
! scaling step
logical :: first ! flag used to indicate that just
! beginning reconstruction of sum
! from integer vector
character(len=*),parameter :: subname = '(ice_reprosum_int)'
!-----------------------------------------------------------------------
! Save radix of i8 variables in an i8 variable
i8_radix = radix(IX_8)
! If validating upper bounds, reserve space for validation metrics
! In both cases, reserve an extra level for overflow from the top level
if (validate) then
voffset = 3
else
voffset = 1
endif
! compute offsets for each field
offset(1) = voffset
do ifld=2,nflds
offset(ifld) = offset(ifld-1) &
+ (max_levels(ifld-1) + voffset)
enddo
veclth = offset(nflds) + max_levels(nflds)
! split summand index range over OpenMP threads
call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
! convert local summands to vector of integers and sum
! (Using scale instead of set_exponent because arr_remainder may not be
! "normal" after level 1 calculation)
i8_arr_lsum_level(:) = 0_i8
!$omp parallel do &
!$omp default(shared) &
!$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, &
!$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8)
do ithread=1,omp_nthreads
! if (detailed_timing) call xicex_timer_start('repro_sum_loopb')
do ifld=1,nflds
ioffset = offset(ifld)
max_error(ifld,ithread) = 0
not_exact(ifld,ithread) = 0
i8_arr_tlsum_level(:,ifld,ithread) = 0_i8
do isum=isum_beg(ithread),isum_end(ithread)
arr_remainder = 0.0_r8
if (arr(isum,ifld) .ne. 0.0_r8) then
arr_exp = exponent(arr(isum,ifld))
arr_frac = fraction(arr(isum,ifld))
! test that global maximum upper bound is an upper bound
if (arr_exp > arr_gmax_exp(ifld)) then
max_error(ifld,ithread) = 1
exit
endif
! calculate first shift
arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
! determine first (probably) nonzero level (assuming initial fraction is
! 'normal' - algorithm still works if this is not true)
! NOTE: this is critical; scale will set to zero if min exponent is too small.
if (arr_shift < 1) then
ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift
arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
do while (arr_shift < 1)
arr_shift = arr_shift + arr_max_shift
ilevel = ilevel + 1
enddo
else
ilevel = 1
endif
if (ilevel .le. max_levels(ifld)) then
! apply first shift/truncate, add it to the relevant running
! sum, and calculate the remainder.
arr_remainder = scale(arr_frac,arr_shift)
i8_arr_level = int(arr_remainder,i8)
i8_arr_tlsum_level(ilevel,ifld,ithread) = &
i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
arr_remainder = arr_remainder - i8_arr_level
! while the remainder is non-zero, continue to shift, truncate,
! sum, and calculate new remainder
do while ((arr_remainder .ne. 0.0_r8) &
.and. (ilevel < max_levels(ifld)))
ilevel = ilevel + 1
arr_remainder = scale(arr_remainder,arr_max_shift)
i8_arr_level = int(arr_remainder,i8)
i8_arr_tlsum_level(ilevel,ifld,ithread) = &
i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
arr_remainder = arr_remainder - i8_arr_level
enddo
endif
endif
if (arr_remainder .ne. 0.0_r8) then
not_exact(ifld,ithread) = 1
endif
enddo
! postprocess integer vector to eliminate potential for overlap in the following
! sums over threads and processes: if value larger than or equal to
! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in
! vector, resulting in nonoverlapping ranges for each component. Note that
! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums
! over threads and processes do not overflow for ilevel==1.
do ilevel=max_levels(ifld),1,-1
RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread)
IX_8 = int(scale(RX_8,-arr_max_shift),i8)
if (IX_8 .ne. 0_i8) then
i8_arr_tlsum_level(ilevel-1,ifld,ithread) = &
i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8
IX_8 = IX_8*(i8_radix**arr_max_shift)