From 933388f5d11ec3582d81d4df48b650131cde8b17 Mon Sep 17 00:00:00 2001
From: Marshall Ward <marshall.ward@noaa.gov>
Date: Tue, 30 Jan 2024 15:28:29 -0500
Subject: [PATCH] Cuberoot: Refactor (re|de)scale functions

Some modifications were made to the cuberoot rescale and descale
functions:

* The machine parameters were moved from function to module parameters.
  This could dangerously expose them to other functions, but it prevents
  multiple definitions of the same numbers.

* The exponent is now cube-roote in rescale rather than descale, and the
  fact that the rescaled exponent is always -1 is now integrated into
  the expressions, rather than read and appended to the old value.

* The exponent expressions are broken into more explicit steps, rather
  than combining multiple steps and assumptions into a single
  expression.  We are counting on the compiler to simplify these!

* The bias is no longer assumed to be a multiple of three!  This is true
  for double precision but not single precision.

* The bias is removed (and later restored) to the exponent terms before
  computing the new exponents after cuberoot.

Although this reduces the number of bit-reading operations by one, I did
not see any appreciable change in performance.  We probably have more
work to do than simple tweaks to bit operations.

A new test of quasi-random number was also added to the cuberoot test
suite.  These numbers were able to detect the differences in GNU and
Intel compiler output.

Co-authored-by: Robert Hallberg <Robert.Hallberg@noaa.gov>
---
 src/framework/MOM_intrinsic_functions.F90 | 129 +++++++++++-----------
 1 file changed, 62 insertions(+), 67 deletions(-)

diff --git a/src/framework/MOM_intrinsic_functions.F90 b/src/framework/MOM_intrinsic_functions.F90
index 5d420057d4..16eac5539a 100644
--- a/src/framework/MOM_intrinsic_functions.F90
+++ b/src/framework/MOM_intrinsic_functions.F90
@@ -12,6 +12,19 @@ module MOM_intrinsic_functions
 public :: invcosh, cuberoot
 public :: intrinsic_functions_unit_tests
 
+! Floating point model, if bit layout from high to low is (sign, exp, frac)
+
+integer, parameter :: bias = maxexponent(1.) - 1
+  !< The double precision exponent offset
+integer, parameter :: signbit = storage_size(1.) - 1
+  !< Position of sign bit
+integer, parameter :: explen = 1 + ceiling(log(real(bias))/log(2.))
+  !< Bit size of exponent
+integer, parameter :: expbit = signbit - explen
+  !< Position of lowest exponent bit
+integer, parameter :: fraclen = expbit
+  !< Length of fractional part
+
 contains
 
 !> Evaluate the inverse cosh, either using a math library or an
@@ -55,7 +68,7 @@ elemental function cuberoot(x) result(root)
     ! Return 0 for an input of 0, or NaN for a NaN input.
     root = x
   else
-    call rescale_exp(x, asx, e_x, s_x)
+    call rescale_cbrt(x, asx, e_x, s_x)
 
     !   Iteratively determine root_asx = asx**1/3 using Halley's method and then Newton's method,
     ! noting that Halley's method onverges monotonically and needs no bounding.  Halley's method is
@@ -83,100 +96,75 @@ elemental function cuberoot(x) result(root)
     ! that is within the last bit of the true solution.
     root_asx = root_asx - (root_asx**3 - asx) / (3.0 * (root_asx**2))
 
-    root = descale_cbrt(root_asx, e_x, s_x)
+    root = descale(root_asx, e_x, s_x)
   endif
 end function cuberoot
 
 
-!> Rescale `a` to the range [0.125, 1) while preserving its fractional term.
-pure subroutine rescale_exp(a, x, e_a, s_a)
+!> Rescale `a` to the range [0.125, 1) and compute its cube-root exponent.
+pure subroutine rescale_cbrt(a, x, e_c, s_a)
   real, intent(in) :: a
-    !< The value to be rescaled
+    !< The real parameter to be rescaled for cube root
   real, intent(out) :: x
     !< The rescaled value of `a`
-  integer(kind=int64), intent(out) :: e_a
-    !< The biased exponent of `a`
+  integer(kind=int64), intent(out) :: e_c
+    !< The cuberoot-exponent of `a`
   integer(kind=int64), intent(out) :: s_a
     !< The sign bit of `a`
 
-  ! Floating point model, if format is (sign, exp, frac)
-  integer, parameter :: bias = maxexponent(1.) - 1
-    !< The double precision exponent offset (assuming a balanced range)
-  integer, parameter :: signbit = storage_size(1.) - 1
-    !< Position of sign bit
-  integer, parameter :: explen = 1 + ceiling(log(real(bias))/log(2.))
-    !< Bit size of exponent
-  integer, parameter :: expbit = signbit - explen
-    !< Position of lowest exponent bit
-  integer, parameter :: fraclen = expbit
-    !< Length of fractional part
-
   integer(kind=int64) :: xb
-    !< A floating point number, bit-packed as an integer
-  integer(kind=int64) :: e_scaled
-    !< The new rescaled exponent of `a` (i.e. the exponent of `x`)
-
-  ! Pack bits of `a` into `xb` and extract its exponent and sign
+    ! Floating point value of `a`, bit-packed as an integer
+  integer(kind=int64) :: e_a
+    ! Unscaled exponent of `a`
+  integer(kind=int64) :: e_x
+    ! Exponent of `x` (i.e. exponent of `a` after rescaling)
+  integer(kind=int64) :: e_div, e_mod
+    ! Quotient and remainder of `e = 3*(e/3) + e % 3`.
+
+  ! Pack bits of `a` into `xb` and extract its exponent and sign.
   xb = transfer(a, 1_int64)
   s_a = ibits(xb, signbit, 1)
-  e_a = ibits(xb, expbit, explen)
+  e_a = ibits(xb, expbit, explen) - bias
+
+  ! Decompose the exponent as `e = 3*(e//3) + modulo(e,3)`.
+  ! Fortran division is round-to-zero so we must reconstruct integer divison.
+  e_mod = modulo(e_a, 3_int64)
+  e_div = (e_a - e_mod)/3
 
-  ! Decompose the exponent as `e = modulo(e,3) + 3*(e/3)` and extract the
-  ! rescaled exponent, now in {-3,-2,-1}
-  e_scaled = modulo(e_a, 3) - 3 + bias
+  ! 1. Compute the exponent of final cube root of `a`.
+  !   * `e = e_a/3 + 1 + cbrt(e_x)` but `cbrt(x)` exponent is always -1.
+  e_c = (e_div + 1) - 1
 
-  ! Insert the new 11-bit exponent into `xb`, while also setting the sign bit
-  ! to zero, ensuring that `xb` is always positive.
-  call mvbits(e_scaled, 0, explen + 1, xb, fraclen)
+  ! 2. Construct the rescaled exponent, shifted to [0.125, 1).
+  e_x = e_mod - 3
 
-  ! Transfer the final modified value to `x`
+  ! Insert the new 11-bit exponent into `xb` and write to `x`, extending the
+  ! bitcount to 12, so that the sign bit is zero and `xb` is always positive.
+  call mvbits(e_x + bias, 0, explen + 1, xb, fraclen)
   x = transfer(xb, 1.)
-end subroutine rescale_exp
+end subroutine rescale_cbrt
 
 
-!> Descale a real number to its original base, and apply the cube root to the
-!! remaining exponent.
-pure function descale_cbrt(x, e_a, s_a) result(r)
+!> Undo the rescaling of a real number back to its original base.
+pure function descale(x, e_a, s_a) result(a)
   real, intent(in) :: x
-    !< Cube root of the rescaled value, which was rescaled to [0.125, 1.0)
+    !< The rescaled value which is to be restored.
   integer(kind=int64), intent(in) :: e_a
-    !< Exponent of the original value to be cube rooted
+    !< Exponent of the unscaled value
   integer(kind=int64), intent(in) :: s_a
-    !< Sign bit of the original value to be cube rooted
-  real :: r
-    !< Restored value with the cube root applied to its exponent
-
-  ! Floating point model, if format is (sign, exp, frac)
-  integer, parameter :: bias = maxexponent(1.) - 1
-    !< The double precision exponent offset (assuming a balanced range)
-  integer, parameter :: signbit = storage_size(1.) - 1
-    !< Position of sign bit
-  integer, parameter :: explen = 1 + ceiling(log(real(bias))/log(2.))
-    !< Bit size of exponent
-  integer, parameter :: expbit = signbit - explen
-    !< Position of lowest exponent bit
-  integer, parameter :: fraclen = expbit
-    !< Length of fractional part
+    !< Sign bit of the unscaled value
+  real :: a
+    !< Restored value with the corrected exponent and sign
 
   integer(kind=int64) :: xb
     ! Bit-packed real number into integer form
-  integer(kind=int64) :: e_r
-    ! Exponent of the descaled value
 
-  ! Extract the exponent of the rescaled value, in {-3, -2, -1}
+  ! Apply the corrected exponent and sign to `x`.
   xb = transfer(x, 1_8)
-  e_r = ibits(xb, expbit, explen)
-
-  ! Apply the cube root to the old exponent (after removing its bias) and add
-  ! to the rescaled exponent.  Correct the previous -3 with a +1.
-  e_r = e_r + (e_a/3 - bias/3 + 1)
-
-  ! Apply the corrected exponent and sign and convert back to real
-  call mvbits(e_r, 0, explen, xb, expbit)
+  call mvbits(e_a + bias, 0, explen, xb, expbit)
   call mvbits(s_a, 0, 1, xb, signbit)
-  r = transfer(xb, 1.)
-end function descale_cbrt
-
+  a = transfer(xb, 1.)
+end function descale
 
 
 !> Returns true if any unit test of intrinsic_functions fails, or false if they all pass.
@@ -186,6 +174,7 @@ logical function intrinsic_functions_unit_tests(verbose)
   ! Local variables
   real :: testval  ! A test value for self-consistency testing [nondim]
   logical :: fail, v
+  integer :: n
 
   fail = .false.
   v = verbose
@@ -200,6 +189,12 @@ logical function intrinsic_functions_unit_tests(verbose)
   fail = fail .or. Test_cuberoot(v, 0.125)
   fail = fail .or. Test_cuberoot(v, 0.965)
 
+  testval = 1.0e-99
+  v = .false.
+  do n=-160,160
+    fail = fail .or. Test_cuberoot(v, testval)
+    testval = (-2.908 * (1.414213562373 + 1.2345678901234e-5*n)) * testval
+  enddo
 end function intrinsic_functions_unit_tests
 
 !> True if the cube of cuberoot(val) does not closely match val. False otherwise.