More aggressive vectorization of NormalizedWeighted
sfiligoi committed Oct 7, 2022
1 parent fcaa3a3 commit 42460d8
Showing 1 changed file with 105 additions and 99 deletions.
204 changes: 105 additions & 99 deletions src/unifrac_task.cpp
@@ -640,10 +640,10 @@ static inline void NormalizedWeighted4(
const uint64_t ls) {
const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
const bool allzero_k = z_k==0x01010101;
const bool allzero_l1 = z_l==0x01010101;
const bool allzero_k = z_k==0x01010101;
const bool allzero_l = z_l==0x01010101;

if (allzero_k && allzero_l1) {
if (allzero_k && allzero_l) {
// nothing to do, would have to add 0
} else {
const TFloat sum_k0 = sums[ks];
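
As a side note, the all-zero test at the top of this hunk packs four per-sample zero flags into a single 32-bit compare. A minimal sketch of the idea, assuming zcheck holds one flag byte per sample (0x01 when that sample's column is entirely zero), with a hypothetical helper name and a memcpy standing in for the pointer cast:

    #include <cstdint>
    #include <cstring>

    // Returns true when all four samples starting at ks are flagged as all-zero.
    static bool block4_all_zero(const unsigned char *zcheck, uint64_t ks) {
        uint32_t z;
        std::memcpy(&z, zcheck + ks, sizeof z);  // one 4-byte load instead of four byte tests
        return z == 0x01010101u;                 // every flag byte is 0x01
    }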
@@ -655,19 +655,13 @@ static inline void NormalizedWeighted4(
const TFloat sum_l2 = sums[ls+2];
const TFloat sum_l3 = sums[ls+3];

// the totals can always use the distributed property
{
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe_total = dm_stripes_total[stripe];
const TFloat sum_kl0 = sum_k0 + sum_l0;
const TFloat sum_kl1 = sum_k1 + sum_l1;
const TFloat sum_kl2 = sum_k2 + sum_l2;
const TFloat sum_kl3 = sum_k3 + sum_l3;
dm_stripe_total[ks] += sum_kl0;
dm_stripe_total[ks+1] += sum_kl1;
dm_stripe_total[ks+2] += sum_kl2;
dm_stripe_total[ks+3] += sum_kl3;
}
const TFloat sum_kl0 = sum_k0 + sum_l0;
const TFloat sum_kl1 = sum_k1 + sum_l1;
const TFloat sum_kl2 = sum_k2 + sum_l2;
const TFloat sum_kl3 = sum_k3 + sum_l3;

int32_t cnt_k = 0;
int32_t cnt_l = 0;

TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];
@@ -679,31 +673,52 @@
dm_stripe[ks+1] += sum_l1;
dm_stripe[ks+2] += sum_l2;
dm_stripe[ks+3] += sum_l3;
} else if (allzero_l1) {
} else if (allzero_l) {
// one side has all zeros, use distributed property
// if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
dm_stripe[ks] += sum_k0;
dm_stripe[ks+1] += sum_k1;
dm_stripe[ks+2] += sum_k2;
dm_stripe[ks+3] += sum_k3;
} else if ((z_k==0) && (z_l==0)) {
// they are all nonzero, so use the vectorized expensive path
} else {
// popcnt is cheap but has large latency, so compute early
cnt_k = popcnt_u32(z_k);
cnt_l = popcnt_u32(z_l);
}

// the totals can always use the distributed property
{
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe_total = dm_stripes_total[stripe];
dm_stripe_total[ks] += sum_kl0;
dm_stripe_total[ks+1] += sum_kl1;
dm_stripe_total[ks+2] += sum_kl2;
dm_stripe_total[ks+3] += sum_kl3;
}

if (allzero_k||allzero_l) {
// already done
} else if ((cnt_k<3) && (cnt_l<3)) {
// several of the elements are nonzero, may as well use the vectorized version
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];

WeightedVal4(dm_stripe,
embedded_proportions, lengths,
filled_embs, n_samples_r,
ks, ls);
} else {
// both sides partially non zero, try smaller vect size
// Use UnnormalizedWeighted since we already computed dm_stripe_total
for (uint64_t i=0; i<4; i++) {
UnnormalizedWeighted1<TFloat>(
// only a few have both sides partially non zero, use the fine grained compute
// Use UnnormalizedWeighted since we already computed dm_stripe_total
for (uint64_t i=0; i<4; i++) {
UnnormalizedWeighted1<TFloat>(
dm_stripes_buf,
zcheck, sums, embedded_proportions, lengths,
filled_embs,idx, n_samples_r,
ks+i, ls+i);
}
}
}
} // (allzero_k && allzero_l1)
} // (allzero_k && allzero_l)
}

template<class TFloat>
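
The branching above rests on two identities that the comments call the distributed property: the normalization total sum_e lengths[e]*(u[e]+v[e]) always splits into sums[ks]+sums[ls] (the proportions are nonnegative), and when one side is entirely zero the numerator sum_e lengths[e]*|u[e]-v[e]| collapses to the other side's precomputed sum, so no per-embedding loop is needed. A minimal sketch of that shortcut, assuming sums[s] stores sum_e lengths[e]*embedded_proportions[e][s] and using a hypothetical helper name:

    #include <cmath>

    // One (k,l) pair: returns the weighted numerator; 'total' accumulates the denominator term.
    double weighted_pair(const double *lengths, const double *u, const double *v,
                         unsigned n_embs, double sum_u, double sum_v,
                         bool v_all_zero, double &total) {
        total += sum_u + sum_v;              // denominator always distributes
        if (v_all_zero)
            return sum_u;                    // |u[e]-0| == u[e], so reuse the precomputed sum
        double acc = 0.0;
        for (unsigned e = 0; e < n_embs; e++)
            acc += lengths[e] * std::fabs(u[e] - v[e]);   // general (expensive) path
        return acc;
    }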
@@ -721,10 +736,10 @@ static inline void NormalizedWeighted8(
const uint64_t ls) {
const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
const bool allzero_k = z_k==0x0101010101010101;
const bool allzero_l1 = z_l==0x0101010101010101;
const bool allzero_k = z_k==0x0101010101010101;
const bool allzero_l = z_l==0x0101010101010101;

if (allzero_k && allzero_l1) {
if (allzero_k && allzero_l) {
// nothing to do, would have to add 0
} else {
const TFloat sum_k0 = sums[ks];
@@ -744,27 +759,17 @@
const TFloat sum_l6 = sums[ls+6];
const TFloat sum_l7 = sums[ls+7];

// the totals can always use the distributed property
{
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe_total = dm_stripes_total[stripe];
const TFloat sum_kl0 = sum_k0 + sum_l0;
const TFloat sum_kl1 = sum_k1 + sum_l1;
const TFloat sum_kl2 = sum_k2 + sum_l2;
const TFloat sum_kl3 = sum_k3 + sum_l3;
const TFloat sum_kl4 = sum_k4 + sum_l4;
const TFloat sum_kl5 = sum_k5 + sum_l5;
const TFloat sum_kl6 = sum_k6 + sum_l6;
const TFloat sum_kl7 = sum_k7 + sum_l7;
dm_stripe_total[ks] += sum_kl0;
dm_stripe_total[ks+1] += sum_kl1;
dm_stripe_total[ks+2] += sum_kl2;
dm_stripe_total[ks+3] += sum_kl3;
dm_stripe_total[ks+4] += sum_kl4;
dm_stripe_total[ks+5] += sum_kl5;
dm_stripe_total[ks+6] += sum_kl6;
dm_stripe_total[ks+7] += sum_kl7;
}
const TFloat sum_kl0 = sum_k0 + sum_l0;
const TFloat sum_kl1 = sum_k1 + sum_l1;
const TFloat sum_kl2 = sum_k2 + sum_l2;
const TFloat sum_kl3 = sum_k3 + sum_l3;
const TFloat sum_kl4 = sum_k4 + sum_l4;
const TFloat sum_kl5 = sum_k5 + sum_l5;
const TFloat sum_kl6 = sum_k6 + sum_l6;
const TFloat sum_kl7 = sum_k7 + sum_l7;

int32_t cnt_k = 0;
int32_t cnt_l = 0;

TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];
Expand All @@ -780,7 +785,7 @@ static inline void NormalizedWeighted8(
dm_stripe[ks+5] += sum_l5;
dm_stripe[ks+6] += sum_l6;
dm_stripe[ks+7] += sum_l7;
} else if (allzero_l1) {
} else if (allzero_l) {
// one side has all zeros, use distributed property
// if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
dm_stripe[ks] += sum_k0;
@@ -791,27 +796,49 @@
dm_stripe[ks+5] += sum_k5;
dm_stripe[ks+6] += sum_k6;
dm_stripe[ks+7] += sum_k7;
} else if ((z_k==0) && (z_l==0)) {
// they are all nonzero, so use the vectorized expensive path
} else {
// popcnt is cheap but has large latency, so compute early
cnt_k = popcnt_u64(z_k);
cnt_l = popcnt_u64(z_l);
}

// the totals can always use the distributed property
{
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe_total = dm_stripes_total[stripe];
dm_stripe_total[ks] += sum_kl0;
dm_stripe_total[ks+1] += sum_kl1;
dm_stripe_total[ks+2] += sum_kl2;
dm_stripe_total[ks+3] += sum_kl3;
dm_stripe_total[ks+4] += sum_kl4;
dm_stripe_total[ks+5] += sum_kl5;
dm_stripe_total[ks+6] += sum_kl6;
dm_stripe_total[ks+7] += sum_kl7;
}

if (allzero_k||allzero_l) {
// already done
} else if ((cnt_k<6) && (cnt_l<6)) {
// several of the elements are nonzero, may as well use the vectorized version
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];

WeightedVal8(dm_stripe,
embedded_proportions, lengths,
filled_embs, n_samples_r,
ks, ls);
} else {
// both sides partially non zero, try smaller vect size
// Use UnnormalizedWeighted since we already computed dm_stripe_total
UnnormalizedWeighted4<TFloat>(
// only a few have both sides partially non zero, use the fine grained compute
// Use UnnormalizedWeighted since we already computed dm_stripe_total
for (uint64_t i=0; i<8; i++) {
UnnormalizedWeighted1<TFloat>(
dm_stripes_buf,
zcheck, sums, embedded_proportions, lengths,
filled_embs,idx, n_samples_r,
ks, ls);
UnnormalizedWeighted4<TFloat>(
dm_stripes_buf,
zcheck, sums, embedded_proportions, lengths,
filled_embs,idx, n_samples_r,
ks+4, ls+4);
ks+i, ls+i);
}
}
} // (allzero_k && allzero_l1)
} // (allzero_k && allzero_l)
}
#endif
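
The new dispatch in both widths uses the population count of the packed flags as a density heuristic: with the 0x01-per-zero-sample encoding, popcount equals the number of zero columns in the block, and the wide kernel is only worth running when most columns are nonzero (fewer than 3 zeros out of 4, or 6 out of 8). A sketch of that decision for the 8-wide case, using C++20 std::popcount in place of the file's popcnt_u64 helper and a hypothetical enum for the three outcomes:

    #include <bit>
    #include <cstdint>

    enum class Path8 { AlreadyHandled, WideVector, ScalarFallback };

    Path8 choose_path8(uint64_t z_k, uint64_t z_l) {
        const int zeros_k = std::popcount(z_k);   // cheap but high-latency, so issued early
        const int zeros_l = std::popcount(z_l);
        if (zeros_k == 8 || zeros_l == 8)
            return Path8::AlreadyHandled;         // a fully-zero side was folded in via the sums
        if (zeros_k < 6 && zeros_l < 6)
            return Path8::WideVector;             // mostly dense: WeightedVal8 over the block
        return Path8::ScalarFallback;             // mostly sparse: per-element UnnormalizedWeighted1
    }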

@@ -867,60 +894,39 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
#else
// SIMD-based CPUs need help with vectorization
const uint64_t idx = (stripe-start_idx) * n_samples_r;
uint64_t ik = 0;

while( ik < (step_size-7) ) {
const uint64_t ks = sk*step_size + ik;
const uint64_t ke = ks+7;

if (ke>=n_samples) break; // past the limit

const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound

if ((le-ls)!=7) break; //not contiguous, use serial version
ik+=8;
uint64_t ks = sk*step_size;
const uint64_t kmax = std::min(ks+step_size,n_samples);
uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound

while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
NormalizedWeighted8<TFloat>(
dm_stripes_buf,dm_stripes_total_buf,
zcheck, sums, embedded_proportions, lengths,
filled_embs,idx, n_samples_r,
ks, ls);
} // for ik

while( ik < (step_size-3) ) {
const uint64_t ks = sk*step_size + ik;
const uint64_t ke = ks+3;

if (ke>=n_samples) break; // past the limit

const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound

if ((le-ls)!=3) break; //not contiguous, use serial version
ik+=4;
ks+=8;
ls = (ls + 8)%n_samples; // wraparound
} // for ks+=8

while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
NormalizedWeighted4<TFloat>(
dm_stripes_buf,dm_stripes_total_buf,
zcheck, sums, embedded_proportions, lengths,
filled_embs,idx, n_samples_r,
ks, ls);
} // for ik
// deal with any leftovers in serial way
while( ik < step_size ) {
const uint64_t k = sk*step_size + ik;

if (k>=n_samples) break; // past the limit
ik++;

const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
ks+=4;
ls = (ls + 4)%n_samples; // wraparound
} // for ks+=4

// deal with any leftovers in serial way
for( ; ks < kmax; ks++ ) {
NormalizedWeighted1<TFloat>(
dm_stripes_buf,dm_stripes_total_buf,
zcheck, sums, embedded_proportions, lengths,
filled_embs,idx, n_samples_r,
k, l1);
} // for ik
ks, ls);
ls = (ls + 1)%n_samples; // wraparound
} // for ks
#endif
} // for stripe
} // for sk
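
The driver loop now walks each stripe block-first instead of probing contiguity with breaks: blocks of 8, then blocks of 4, then a scalar tail, where a width-w block is taken only while the k range fits below kmax and the wrapped l range still has at least w contiguous slots (n_samples - ls >= w). A condensed sketch of that structure, with process8/process4/process1 as stand-ins for the NormalizedWeighted kernels:

    #include <algorithm>
    #include <cstdint>

    // Stand-ins for the NormalizedWeighted{8,4,1} kernels.
    void process8(uint64_t ks, uint64_t ls);
    void process4(uint64_t ks, uint64_t ls);
    void process1(uint64_t ks, uint64_t ls);

    void walk_stripe(uint64_t sk, uint64_t step_size, uint64_t stripe, uint64_t n_samples) {
        uint64_t ks = sk * step_size;
        const uint64_t kmax = std::min(ks + step_size, n_samples);
        uint64_t ls = (ks + stripe + 1) % n_samples;        // paired column, may wrap around

        while ((ks + 8) <= kmax && (n_samples - ls) >= 8) { // 8 contiguous samples on both sides
            process8(ks, ls);
            ks += 8;  ls = (ls + 8) % n_samples;
        }
        while ((ks + 4) <= kmax && (n_samples - ls) >= 4) { // then 4-wide blocks
            process4(ks, ls);
            ks += 4;  ls = (ls + 4) % n_samples;
        }
        for (; ks < kmax; ks++) {                           // serial leftovers, one pair at a time
            process1(ks, ls);
            ls = (ls + 1) % n_samples;
        }
    }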
