From 42460d85ed1c6658b7e7d5b487a6869f804466fe Mon Sep 17 00:00:00 2001
From: Igor Sfiligoi
Date: Fri, 7 Oct 2022 13:56:46 -0700
Subject: [PATCH] More aggressive vectorization of NormalizedWeighted

---
 src/unifrac_task.cpp | 204 ++++++++++++++++++++++---------------------
 1 file changed, 105 insertions(+), 99 deletions(-)

diff --git a/src/unifrac_task.cpp b/src/unifrac_task.cpp
index 57da4eb..627d3b5 100644
--- a/src/unifrac_task.cpp
+++ b/src/unifrac_task.cpp
@@ -640,10 +640,10 @@ static inline void NormalizedWeighted4(
                                        const uint64_t ls) {
     const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
     const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
-    const bool allzero_k = z_k==0x01010101;
-    const bool allzero_l1 = z_l==0x01010101;
+    const bool allzero_k = z_k==0x01010101;
+    const bool allzero_l = z_l==0x01010101;

-    if (allzero_k && allzero_l1) {
+    if (allzero_k && allzero_l) {
       // nothing to do, would have to add 0
     } else {
       const TFloat sum_k0 = sums[ks];
@@ -655,19 +655,13 @@ static inline void NormalizedWeighted4(
       const TFloat sum_l2 = sums[ls+2];
       const TFloat sum_l3 = sums[ls+3];

-      // the totals can always use the distributed property
-      {
-        TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-        //TFloat *dm_stripe_total = dm_stripes_total[stripe];
-        const TFloat sum_kl0 = sum_k0 + sum_l0;
-        const TFloat sum_kl1 = sum_k1 + sum_l1;
-        const TFloat sum_kl2 = sum_k2 + sum_l2;
-        const TFloat sum_kl3 = sum_k3 + sum_l3;
-        dm_stripe_total[ks] += sum_kl0;
-        dm_stripe_total[ks+1] += sum_kl1;
-        dm_stripe_total[ks+2] += sum_kl2;
-        dm_stripe_total[ks+3] += sum_kl3;
-      }
+      const TFloat sum_kl0 = sum_k0 + sum_l0;
+      const TFloat sum_kl1 = sum_k1 + sum_l1;
+      const TFloat sum_kl2 = sum_k2 + sum_l2;
+      const TFloat sum_kl3 = sum_k3 + sum_l3;
+
+      int32_t cnt_k = 0;
+      int32_t cnt_l = 0;

       TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
       //TFloat *dm_stripe = dm_stripes[stripe];
@@ -679,31 +673,52 @@ static inline void NormalizedWeighted4(
         dm_stripe[ks+1] += sum_l1;
         dm_stripe[ks+2] += sum_l2;
         dm_stripe[ks+3] += sum_l3;
-      } else if (allzero_l1) {
+      } else if (allzero_l) {
         // one side has all zeros, use distributed property
         // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
         dm_stripe[ks] += sum_k0;
         dm_stripe[ks+1] += sum_k1;
         dm_stripe[ks+2] += sum_k2;
         dm_stripe[ks+3] += sum_k3;
-      } else if ((z_k==0) && (z_l==0)) {
-        // they are all nonzero, so use the vectorized expensive path
+      } else {
+        // popcnt is cheap but has large latency, so compute early
+        cnt_k = popcnt_u32(z_k);
+        cnt_l = popcnt_u32(z_l);
+      }
+
+      // the totals can always use the distributed property
+      {
+        TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+        //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+        dm_stripe_total[ks] += sum_kl0;
+        dm_stripe_total[ks+1] += sum_kl1;
+        dm_stripe_total[ks+2] += sum_kl2;
+        dm_stripe_total[ks+3] += sum_kl3;
+      }
+
+      if (allzero_k||allzero_l) {
+        // already done
+      } else if ((cnt_k<3) && (cnt_l<3)) {
+        // several of the elements are nonzero, may as well use the vectorized version
+        TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+        //TFloat *dm_stripe = dm_stripes[stripe];
+
         WeightedVal4(dm_stripe, embedded_proportions, lengths, filled_embs, n_samples_r, ks, ls);
       } else {
-        // both sides partially non zero, try smaller vect size
-        // Use UnnormalizedWeighted since we already computed dm_stripe_total
-        for (uint64_t i=0; i<4; i++) {
-          UnnormalizedWeighted1(
+        // only a few have both sides partially non-zero, use the fine-grained compute
+        // Use UnnormalizedWeighted since we already computed dm_stripe_total
+        for (uint64_t i=0; i<4; i++) {
+          UnnormalizedWeighted1(
                 dm_stripes_buf,
                 zcheck, sums, embedded_proportions, lengths,
                 filled_embs,idx, n_samples_r,
                 ks+i, ls+i);
-        }
+        }
       }
-    } // (allzero_k && allzero_l1)
+    } // (allzero_k && allzero_l)
 }

 template<class TFloat>
@@ -721,10 +736,10 @@ static inline void NormalizedWeighted8(
                                        const uint64_t ls) {
     const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
     const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
-    const bool allzero_k = z_k==0x0101010101010101;
-    const bool allzero_l1 = z_l==0x0101010101010101;
+    const bool allzero_k = z_k==0x0101010101010101;
+    const bool allzero_l = z_l==0x0101010101010101;

-    if (allzero_k && allzero_l1) {
+    if (allzero_k && allzero_l) {
       // nothing to do, would have to add 0
     } else {
       const TFloat sum_k0 = sums[ks];
@@ -744,27 +759,17 @@ static inline void NormalizedWeighted8(
       const TFloat sum_l6 = sums[ls+6];
       const TFloat sum_l7 = sums[ls+7];

-      // the totals can always use the distributed property
-      {
-        TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-        //TFloat *dm_stripe_total = dm_stripes_total[stripe];
-        const TFloat sum_kl0 = sum_k0 + sum_l0;
-        const TFloat sum_kl1 = sum_k1 + sum_l1;
-        const TFloat sum_kl2 = sum_k2 + sum_l2;
-        const TFloat sum_kl3 = sum_k3 + sum_l3;
-        const TFloat sum_kl4 = sum_k4 + sum_l4;
-        const TFloat sum_kl5 = sum_k5 + sum_l5;
-        const TFloat sum_kl6 = sum_k6 + sum_l6;
-        const TFloat sum_kl7 = sum_k7 + sum_l7;
-        dm_stripe_total[ks] += sum_kl0;
-        dm_stripe_total[ks+1] += sum_kl1;
-        dm_stripe_total[ks+2] += sum_kl2;
-        dm_stripe_total[ks+3] += sum_kl3;
-        dm_stripe_total[ks+4] += sum_kl4;
-        dm_stripe_total[ks+5] += sum_kl5;
-        dm_stripe_total[ks+6] += sum_kl6;
-        dm_stripe_total[ks+7] += sum_kl7;
-      }
+      const TFloat sum_kl0 = sum_k0 + sum_l0;
+      const TFloat sum_kl1 = sum_k1 + sum_l1;
+      const TFloat sum_kl2 = sum_k2 + sum_l2;
+      const TFloat sum_kl3 = sum_k3 + sum_l3;
+      const TFloat sum_kl4 = sum_k4 + sum_l4;
+      const TFloat sum_kl5 = sum_k5 + sum_l5;
+      const TFloat sum_kl6 = sum_k6 + sum_l6;
+      const TFloat sum_kl7 = sum_k7 + sum_l7;
+
+      int32_t cnt_k = 0;
+      int32_t cnt_l = 0;

       TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
       //TFloat *dm_stripe = dm_stripes[stripe];
@@ -780,7 +785,7 @@ static inline void NormalizedWeighted8(
         dm_stripe[ks+5] += sum_l5;
         dm_stripe[ks+6] += sum_l6;
         dm_stripe[ks+7] += sum_l7;
-      } else if (allzero_l1) {
+      } else if (allzero_l) {
         // one side has all zeros, use distributed property
         // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
         dm_stripe[ks] += sum_k0;
@@ -791,27 +796,49 @@ static inline void NormalizedWeighted8(
         dm_stripe[ks+5] += sum_k5;
         dm_stripe[ks+6] += sum_k6;
         dm_stripe[ks+7] += sum_k7;
-      } else if ((z_k==0) && (z_l==0)) {
-        // they are all nonzero, so use the vectorized expensive path
+      } else {
+        // popcnt is cheap but has large latency, so compute early
+        cnt_k = popcnt_u64(z_k);
+        cnt_l = popcnt_u64(z_l);
+      }
+
+      // the totals can always use the distributed property
+      {
+        TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+        //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+        dm_stripe_total[ks] += sum_kl0;
+        dm_stripe_total[ks+1] += sum_kl1;
+        dm_stripe_total[ks+2] += sum_kl2;
+        dm_stripe_total[ks+3] += sum_kl3;
+        dm_stripe_total[ks+4] += sum_kl4;
+        dm_stripe_total[ks+5] += sum_kl5;
+        dm_stripe_total[ks+6] += sum_kl6;
+        dm_stripe_total[ks+7] += sum_kl7;
+      }
+
+      if (allzero_k||allzero_l) {
+        // already done
+      } else if ((cnt_k<6) && (cnt_l<6)) {
+        // several of the elements are nonzero, may as well use the vectorized version
+        TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+        //TFloat *dm_stripe = dm_stripes[stripe];
+
         WeightedVal8(dm_stripe, embedded_proportions, lengths, filled_embs, n_samples_r, ks, ls);
       } else {
-        // both sides partially non zero, try smaller vect size
-        // Use UnnormalizedWeighted since we already computed dm_stripe_total
-        UnnormalizedWeighted4(
+        // only a few have both sides partially non-zero, use the fine-grained compute
+        // Use UnnormalizedWeighted since we already computed dm_stripe_total
+        for (uint64_t i=0; i<8; i++) {
+          UnnormalizedWeighted1(
                 dm_stripes_buf,
                 zcheck, sums, embedded_proportions, lengths,
                 filled_embs,idx, n_samples_r,
-                ks, ls);
-        UnnormalizedWeighted4(
-                dm_stripes_buf,
-                zcheck, sums, embedded_proportions, lengths,
-                filled_embs,idx, n_samples_r,
-                ks+4, ls+4);
+                ks+i, ls+i);
+        }
       }
-    } // (allzero_k && allzero_l1)
+    } // (allzero_k && allzero_l)
 }
 #endif
@@ -867,60 +894,39 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
 #else
     // SIMD-based CPUs need help with vectorization
     const uint64_t idx = (stripe-start_idx) * n_samples_r;
-    uint64_t ik = 0;
-
-    while( ik < (step_size-7) ) {
-       const uint64_t ks = sk*step_size + ik;
-       const uint64_t ke = ks+7;
-
-       if (ke>=n_samples) break; // past the limit
-
-       const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
-       const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound
-
-       if ((le-ls)!=7) break; //not contiguous, use serial version
-       ik+=8;
+    uint64_t ks = sk*step_size;
+    const uint64_t kmax = std::min(ks+step_size,n_samples);
+    uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
+    while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
       NormalizedWeighted8(
                        dm_stripes_buf,dm_stripes_total_buf,
                        zcheck, sums, embedded_proportions, lengths,
                        filled_embs,idx, n_samples_r, ks, ls);
-    } // for ik
-
-    while( ik < (step_size-3) ) {
-       const uint64_t ks = sk*step_size + ik;
-       const uint64_t ke = ks+3;
-
-       if (ke>=n_samples) break; // past the limit
-
-       const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
-       const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound
-
-       if ((le-ls)!=3) break; //not contiguous, use serial version
-       ik+=4;
+      ks+=8;
+      ls = (ls + 8)%n_samples; // wraparound
+    } // for ks+=8
+    while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
      NormalizedWeighted4(
                        dm_stripes_buf,dm_stripes_total_buf,
                        zcheck, sums, embedded_proportions, lengths,
                        filled_embs,idx, n_samples_r, ks, ls);
-    } // for ik
-    // deal with any leftovers in serial way
-    while( ik < step_size ) {
-       const uint64_t k = sk*step_size + ik;
-
-       if (k>=n_samples) break; // past the limit
-       ik++;
-
-       const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+      ks+=4;
+      ls = (ls + 4)%n_samples; // wraparound
+    } // for ks+=4
+    // deal with any leftovers in serial way
+    for( ; ks < kmax; ks++ ) {
       NormalizedWeighted1(
                        dm_stripes_buf,dm_stripes_total_buf,
                        zcheck, sums, embedded_proportions, lengths,
                        filled_embs,idx, n_samples_r,
-                       k, l1);
-    } // for ik
+                       ks, ls);
+      ls = (ls + 1)%n_samples; // wraparound
+    } // for ks
 #endif
   } // for stripe
  } // for sk
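
Note on the block dispatch above: each 4- or 8-sample block now picks its code path from the per-column zero flags alone. Below is a simplified, self-contained sketch of that selection logic, written under the assumption that zcheck[x]==1 marks a column whose embedded values are all zero; the names Path8 and choose_path8, and the portable popcount fallback, are illustrative only and are not symbols from unifrac_task.cpp.

#include <cstdint>
#include <cstring>

static inline int popcnt_u64(uint64_t v) {
#if defined(__GNUC__) || defined(__clang__)
  return __builtin_popcountll(v);
#else
  int c = 0;
  for (; v != 0; v &= v - 1) ++c;   // portable fallback
  return c;
#endif
}

enum class Path8 { Skip, OneSideZero, Wide, PerColumn };

// Pick the cheapest path for an 8-column block starting at ks, paired with the
// wrapped block starting at ls, given one zero-flag byte per column.
static inline Path8 choose_path8(const uint8_t *zcheck, uint64_t ks, uint64_t ls) {
  uint64_t z_k, z_l;
  std::memcpy(&z_k, zcheck + ks, sizeof(z_k));   // read 8 flags as one word
  std::memcpy(&z_l, zcheck + ls, sizeof(z_l));

  const bool allzero_k = (z_k == 0x0101010101010101ULL);
  const bool allzero_l = (z_l == 0x0101010101010101ULL);

  if (allzero_k && allzero_l) return Path8::Skip;          // nothing to accumulate
  if (allzero_k || allzero_l) return Path8::OneSideZero;   // add the other side's sums directly

  // Each 0x01 byte is one all-zero column, so the popcount is the zero-column count.
  const int cnt_k = popcnt_u64(z_k);
  const int cnt_l = popcnt_u64(z_l);

  // Few enough zero columns on both sides: the straight 8-wide kernel vectorizes well.
  // Many zero columns: per-column work touches far less data.
  return ((cnt_k < 6) && (cnt_l < 6)) ? Path8::Wide : Path8::PerColumn;
}

Reading the eight flags as a single 64-bit word makes the all-zero test one compare, and the popcount result decides whether the wide kernel (WeightedVal8 in the patch) or the per-column fallback (UnnormalizedWeighted1) is the cheaper way to finish the block; the dm_stripe_total accumulation stays unconditional either way.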
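
The rewritten driver loop in _run follows the same spirit: instead of recomputing ks, ke, ls and le from ik on every iteration, it keeps ks and the wrapped ls as running indices and only drops to a narrower block when one of the two ranges stops being contiguous. A compact restatement of that control flow, with the actual block kernels abstracted into callables (walk_stripe and do8/do4/do1 are illustrative names, not symbols from the patch):

#include <algorithm>
#include <cstdint>

// Walk one step_size chunk of a stripe: 8-wide blocks while both the k range
// and the wrapped l range stay contiguous, then 4-wide blocks, then singles.
template<class Do8, class Do4, class Do1>
static void walk_stripe(uint64_t sk, uint64_t step_size, uint64_t stripe,
                        uint64_t n_samples, Do8 do8, Do4 do4, Do1 do1) {
  uint64_t ks = sk * step_size;
  const uint64_t kmax = std::min(ks + step_size, n_samples);
  uint64_t ls = (ks + stripe + 1) % n_samples;   // partner column, wraps around

  while (((ks + 8) <= kmax) && ((n_samples - ls) >= 8)) {
    do8(ks, ls);                                 // e.g. NormalizedWeighted8
    ks += 8;
    ls = (ls + 8) % n_samples;
  }
  while (((ks + 4) <= kmax) && ((n_samples - ls) >= 4)) {
    do4(ks, ls);                                 // e.g. NormalizedWeighted4
    ks += 4;
    ls = (ls + 4) % n_samples;
  }
  for (; ks < kmax; ks++) {                      // leftovers, including the wrap point
    do1(ks, ls);                                 // e.g. NormalizedWeighted1
    ls = (ls + 1) % n_samples;
  }
}

Unlike the previous ik-based loops, which recomputed the modulo for both ends of every block and broke out of the wide loops as soon as a block straddled the wraparound, this version keeps taking wide blocks until the wrapped l range itself would become non-contiguous, and lets the single-sample loop absorb whatever remains.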