From 42460d85ed1c6658b7e7d5b487a6869f804466fe Mon Sep 17 00:00:00 2001
From: Igor Sfiligoi
Date: Fri, 7 Oct 2022 13:56:46 -0700
Subject: [PATCH] More aggressive vectorization of NormalizedWeighted

---
 src/unifrac_task.cpp | 204 ++++++++++++++++++++++---------------------
 1 file changed, 105 insertions(+), 99 deletions(-)

diff --git a/src/unifrac_task.cpp b/src/unifrac_task.cpp
index 57da4eb..627d3b5 100644
--- a/src/unifrac_task.cpp
+++ b/src/unifrac_task.cpp
@@ -640,10 +640,10 @@ static inline void NormalizedWeighted4(
                                        const uint64_t ls) {
     const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
     const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
-    const bool allzero_k = z_k==0x01010101;
-    const bool allzero_l1 = z_l==0x01010101;
+    const bool allzero_k = z_k==0x01010101;
+    const bool allzero_l = z_l==0x01010101;

-    if (allzero_k && allzero_l1) {
+    if (allzero_k && allzero_l) {
       // nothing to do, would have to add 0
     } else {
       const TFloat sum_k0 = sums[ks];
@@ -655,19 +655,13 @@ static inline void NormalizedWeighted4(
       const TFloat sum_l2 = sums[ls+2];
       const TFloat sum_l3 = sums[ls+3];

-      // the totals can always use the distributed property
-      {
-        TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-        //TFloat *dm_stripe_total = dm_stripes_total[stripe];
-        const TFloat sum_kl0 = sum_k0 + sum_l0;
-        const TFloat sum_kl1 = sum_k1 + sum_l1;
-        const TFloat sum_kl2 = sum_k2 + sum_l2;
-        const TFloat sum_kl3 = sum_k3 + sum_l3;
-        dm_stripe_total[ks] += sum_kl0;
-        dm_stripe_total[ks+1] += sum_kl1;
-        dm_stripe_total[ks+2] += sum_kl2;
-        dm_stripe_total[ks+3] += sum_kl3;
-      }
+      const TFloat sum_kl0 = sum_k0 + sum_l0;
+      const TFloat sum_kl1 = sum_k1 + sum_l1;
+      const TFloat sum_kl2 = sum_k2 + sum_l2;
+      const TFloat sum_kl3 = sum_k3 + sum_l3;
+
+      int32_t cnt_k = 0;
+      int32_t cnt_l = 0;

       TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
       //TFloat *dm_stripe = dm_stripes[stripe];
@@ -679,31 +673,52 @@ static inline void NormalizedWeighted4(
         dm_stripe[ks+1] += sum_l1;
         dm_stripe[ks+2] += sum_l2;
         dm_stripe[ks+3] += sum_l3;
-      } else if (allzero_l1) {
+      } else if (allzero_l) {
         // one side has all zeros, use distributed property
         // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
         dm_stripe[ks] += sum_k0;
         dm_stripe[ks+1] += sum_k1;
         dm_stripe[ks+2] += sum_k2;
         dm_stripe[ks+3] += sum_k3;
-      } else if ((z_k==0) && (z_l==0)) {
-        // they are all nonzero, so use the vectorized expensive path
+      } else {
+        // popcnt is cheap but has large latency, so compute early
+        cnt_k = popcnt_u32(z_k);
+        cnt_l = popcnt_u32(z_l);
+      }
+
+      // the totals can always use the distributed property
+      {
+        TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+        //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+        dm_stripe_total[ks] += sum_kl0;
+        dm_stripe_total[ks+1] += sum_kl1;
+        dm_stripe_total[ks+2] += sum_kl2;
+        dm_stripe_total[ks+3] += sum_kl3;
+      }
+
+      if (allzero_k||allzero_l) {
+        // already done
+      } else if ((cnt_k<3) && (cnt_l<3)) {
+        // several of the elements are nonzero, may as well use the vectorized version
+        TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+        //TFloat *dm_stripe = dm_stripes[stripe];
+
         WeightedVal4(dm_stripe, embedded_proportions, lengths, filled_embs, n_samples_r, ks, ls);
       } else {
-        // both sides partially non zero, try smaller vect size
-        // Use UnnormalizedWeighted since we already computed dm_stripe_total
-        for (uint64_t i=0; i<4; i++) {
-          UnnormalizedWeighted1(
+        // only a few have both sides partially non-zero, use the fine-grained compute
+        // Use UnnormalizedWeighted since we already computed dm_stripe_total
+        for (uint64_t i=0; i<4; i++) {
+          UnnormalizedWeighted1(
                 dm_stripes_buf,
                 zcheck, sums, embedded_proportions, lengths,
                 filled_embs,idx, n_samples_r,
                 ks+i, ls+i);
-        }
+        }
       }
-    } // (allzero_k && allzero_l1)
+    } // (allzero_k && allzero_l)
 }

 template<class TFloat>
@@ -721,10 +736,10 @@ static inline void NormalizedWeighted8(
                                        const uint64_t ls) {
     const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
     const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
-    const bool allzero_k = z_k==0x0101010101010101;
-    const bool allzero_l1 = z_l==0x0101010101010101;
+    const bool allzero_k = z_k==0x0101010101010101;
+    const bool allzero_l = z_l==0x0101010101010101;

-    if (allzero_k && allzero_l1) {
+    if (allzero_k && allzero_l) {
       // nothing to do, would have to add 0
     } else {
       const TFloat sum_k0 = sums[ks];
@@ -744,27 +759,17 @@ static inline void NormalizedWeighted8(
       const TFloat sum_l6 = sums[ls+6];
       const TFloat sum_l7 = sums[ls+7];

-      // the totals can always use the distributed property
-      {
-        TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-        //TFloat *dm_stripe_total = dm_stripes_total[stripe];
-        const TFloat sum_kl0 = sum_k0 + sum_l0;
-        const TFloat sum_kl1 = sum_k1 + sum_l1;
-        const TFloat sum_kl2 = sum_k2 + sum_l2;
-        const TFloat sum_kl3 = sum_k3 + sum_l3;
-        const TFloat sum_kl4 = sum_k4 + sum_l4;
-        const TFloat sum_kl5 = sum_k5 + sum_l5;
-        const TFloat sum_kl6 = sum_k6 + sum_l6;
-        const TFloat sum_kl7 = sum_k7 + sum_l7;
-        dm_stripe_total[ks] += sum_kl0;
-        dm_stripe_total[ks+1] += sum_kl1;
-        dm_stripe_total[ks+2] += sum_kl2;
-        dm_stripe_total[ks+3] += sum_kl3;
-        dm_stripe_total[ks+4] += sum_kl4;
-        dm_stripe_total[ks+5] += sum_kl5;
-        dm_stripe_total[ks+6] += sum_kl6;
-        dm_stripe_total[ks+7] += sum_kl7;
-      }
+      const TFloat sum_kl0 = sum_k0 + sum_l0;
+      const TFloat sum_kl1 = sum_k1 + sum_l1;
+      const TFloat sum_kl2 = sum_k2 + sum_l2;
+      const TFloat sum_kl3 = sum_k3 + sum_l3;
+      const TFloat sum_kl4 = sum_k4 + sum_l4;
+      const TFloat sum_kl5 = sum_k5 + sum_l5;
+      const TFloat sum_kl6 = sum_k6 + sum_l6;
+      const TFloat sum_kl7 = sum_k7 + sum_l7;
+
+      int32_t cnt_k = 0;
+      int32_t cnt_l = 0;

       TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
       //TFloat *dm_stripe = dm_stripes[stripe];
@@ -780,7 +785,7 @@ static inline void NormalizedWeighted8(
         dm_stripe[ks+5] += sum_l5;
         dm_stripe[ks+6] += sum_l6;
         dm_stripe[ks+7] += sum_l7;
-      } else if (allzero_l1) {
+      } else if (allzero_l) {
         // one side has all zeros, use distributed property
         // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
         dm_stripe[ks] += sum_k0;
@@ -791,27 +796,49 @@ static inline void NormalizedWeighted8(
         dm_stripe[ks+5] += sum_k5;
         dm_stripe[ks+6] += sum_k6;
         dm_stripe[ks+7] += sum_k7;
-      } else if ((z_k==0) && (z_l==0)) {
-        // they are all nonzero, so use the vectorized expensive path
+      } else {
+        // popcnt is cheap but has large latency, so compute early
+        cnt_k = popcnt_u64(z_k);
+        cnt_l = popcnt_u64(z_l);
+      }
+
+      // the totals can always use the distributed property
+      {
+        TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+        //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+        dm_stripe_total[ks] += sum_kl0;
+        dm_stripe_total[ks+1] += sum_kl1;
+        dm_stripe_total[ks+2] += sum_kl2;
+        dm_stripe_total[ks+3] += sum_kl3;
+        dm_stripe_total[ks+4] += sum_kl4;
+        dm_stripe_total[ks+5] += sum_kl5;
+        dm_stripe_total[ks+6] += sum_kl6;
+        dm_stripe_total[ks+7] += sum_kl7;
+      }
+
+      if (allzero_k||allzero_l) {
+        // already done
+      } else if ((cnt_k<6) && (cnt_l<6)) {
+        // several of the elements are nonzero, may as well use the vectorized version
+        TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+        //TFloat *dm_stripe = dm_stripes[stripe];
+
         WeightedVal8(dm_stripe, embedded_proportions, lengths, filled_embs, n_samples_r, ks, ls);
       } else {
-        // both sides partially non zero, try smaller vect size
-        // Use UnnormalizedWeighted since we already computed dm_stripe_total
-        UnnormalizedWeighted4(
+        // only a few have both sides partially non-zero, use the fine-grained compute
+        // Use UnnormalizedWeighted since we already computed dm_stripe_total
+        for (uint64_t i=0; i<8; i++) {
+          UnnormalizedWeighted1(
                 dm_stripes_buf,
                 zcheck, sums, embedded_proportions, lengths,
                 filled_embs,idx, n_samples_r,
-                ks, ls);
-        UnnormalizedWeighted4(
-                dm_stripes_buf,
-                zcheck, sums, embedded_proportions, lengths,
-                filled_embs,idx, n_samples_r,
-                ks+4, ls+4);
+                ks+i, ls+i);
+        }
       }
-    } // (allzero_k && allzero_l1)
+    } // (allzero_k && allzero_l)
 }
 #endif
@@ -867,60 +894,39 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
 #else
     // SIMD-based CPUs need help with vectorization
     const uint64_t idx = (stripe-start_idx) * n_samples_r;
-    uint64_t ik = 0;
-
-    while( ik < (step_size-7) ) {
-       const uint64_t ks = sk*step_size + ik;
-       const uint64_t ke = ks+7;
-
-       if (ke>=n_samples) break; // past the limit
-
-       const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
-       const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound
-
-       if ((le-ls)!=7) break; //not contiguous, use serial version
-       ik+=8;
+    uint64_t ks = sk*step_size;
+    const uint64_t kmax = std::min(ks+step_size,n_samples);
+    uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
+    while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
       NormalizedWeighted8(
                        dm_stripes_buf,dm_stripes_total_buf,
                        zcheck, sums, embedded_proportions, lengths,
                        filled_embs,idx, n_samples_r, ks, ls);
-    } // for ik
-
-    while( ik < (step_size-3) ) {
-       const uint64_t ks = sk*step_size + ik;
-       const uint64_t ke = ks+3;
-
-       if (ke>=n_samples) break; // past the limit
-
-       const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
-       const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound
-
-       if ((le-ls)!=3) break; //not contiguous, use serial version
-       ik+=4;
+      ks+=8;
+      ls = (ls + 8)%n_samples; // wraparound
+    } // for ks+=8
+    while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
      NormalizedWeighted4(
                        dm_stripes_buf,dm_stripes_total_buf,
                        zcheck, sums, embedded_proportions, lengths,
                        filled_embs,idx, n_samples_r, ks, ls);
-    } // for ik
-    // deal with any leftovers in serial way
-    while( ik < step_size ) {
-       const uint64_t k = sk*step_size + ik;
-
-       if (k>=n_samples) break; // past the limit
-       ik++;
-
-       const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+      ks+=4;
+      ls = (ls + 4)%n_samples; // wraparound
+    } // for ks+=4
+    // deal with any leftovers in serial way
+    for( ; ks < kmax; ks++ ) {
       NormalizedWeighted1(
                        dm_stripes_buf,dm_stripes_total_buf,
                        zcheck, sums, embedded_proportions, lengths,
                        filled_embs,idx, n_samples_r,
-                       k, l1);
-    } // for ik
+                       ks, ls);
+      ls = (ls + 1)%n_samples; // wraparound
+    } // for ks
 #endif
   } // for stripe
  } // for sk
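
Note on the block dispatch above: each 4- or 8-sample block now picks its code path from the per-column zero flags alone. Below is a simplified, self-contained sketch of that selection logic, written under the assumption that zcheck[x]==1 marks a column whose embedded values are all zero; the names Path8 and choose_path8, and the portable popcount fallback, are illustrative only and are not symbols from unifrac_task.cpp.

#include <cstdint>
#include <cstring>

static inline int popcnt_u64(uint64_t v) {
#if defined(__GNUC__) || defined(__clang__)
  return __builtin_popcountll(v);
#else
  int c = 0;
  for (; v != 0; v &= v - 1) ++c;   // portable fallback
  return c;
#endif
}

enum class Path8 { Skip, OneSideZero, Wide, PerColumn };

// Pick the cheapest path for an 8-column block starting at ks, paired with the
// wrapped block starting at ls, given one zero-flag byte per column.
static inline Path8 choose_path8(const uint8_t *zcheck, uint64_t ks, uint64_t ls) {
  uint64_t z_k, z_l;
  std::memcpy(&z_k, zcheck + ks, sizeof(z_k));   // read 8 flags as one word
  std::memcpy(&z_l, zcheck + ls, sizeof(z_l));

  const bool allzero_k = (z_k == 0x0101010101010101ULL);
  const bool allzero_l = (z_l == 0x0101010101010101ULL);

  if (allzero_k && allzero_l) return Path8::Skip;          // nothing to accumulate
  if (allzero_k || allzero_l) return Path8::OneSideZero;   // add the other side's sums directly

  // Each 0x01 byte is one all-zero column, so the popcount is the zero-column count.
  const int cnt_k = popcnt_u64(z_k);
  const int cnt_l = popcnt_u64(z_l);

  // Few enough zero columns on both sides: the straight 8-wide kernel vectorizes well.
  // Many zero columns: per-column work touches far less data.
  return ((cnt_k < 6) && (cnt_l < 6)) ? Path8::Wide : Path8::PerColumn;
}

Reading the eight flags as a single 64-bit word makes the all-zero test one compare, and the popcount result decides whether the wide kernel (WeightedVal8 in the patch) or the per-column fallback (UnnormalizedWeighted1) is the cheaper way to finish the block; the dm_stripe_total accumulation stays unconditional either way.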
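
The rewritten driver loop in _run follows the same spirit: instead of recomputing ks, ke, ls and le from ik on every iteration, it keeps ks and the wrapped ls as running indices and only drops to a narrower block when one of the two ranges stops being contiguous. A compact restatement of that control flow, with the actual block kernels abstracted into callables (walk_stripe and do8/do4/do1 are illustrative names, not symbols from the patch):

#include <algorithm>
#include <cstdint>

// Walk one step_size chunk of a stripe: 8-wide blocks while both the k range
// and the wrapped l range stay contiguous, then 4-wide blocks, then singles.
template<class Do8, class Do4, class Do1>
static void walk_stripe(uint64_t sk, uint64_t step_size, uint64_t stripe,
                        uint64_t n_samples, Do8 do8, Do4 do4, Do1 do1) {
  uint64_t ks = sk * step_size;
  const uint64_t kmax = std::min(ks + step_size, n_samples);
  uint64_t ls = (ks + stripe + 1) % n_samples;   // partner column, wraps around

  while (((ks + 8) <= kmax) && ((n_samples - ls) >= 8)) {
    do8(ks, ls);                                 // e.g. NormalizedWeighted8
    ks += 8;
    ls = (ls + 8) % n_samples;
  }
  while (((ks + 4) <= kmax) && ((n_samples - ls) >= 4)) {
    do4(ks, ls);                                 // e.g. NormalizedWeighted4
    ks += 4;
    ls = (ls + 4) % n_samples;
  }
  for (; ks < kmax; ks++) {                      // leftovers, including the wrap point
    do1(ks, ls);                                 // e.g. NormalizedWeighted1
    ls = (ls + 1) % n_samples;
  }
}

Unlike the previous ik-based loops, which recomputed the modulo for both ends of every block and broke out of the wide loops as soon as a block straddled the wraparound, this version keeps taking wide blocks until the wrapped l range itself would become non-contiguous, and lets the single-sample loop absorb whatever remains.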