Add precomputed sums for Unweighted + bug fix #13

Merged
7 commits merged on Oct 18, 2022
279 changes: 156 additions & 123 deletions src/unifrac_task.cpp
@@ -52,35 +52,6 @@ static inline void WeightedZerosAndSums(
}
}

// check for zero values
template<class T>
static inline void WeightedZerosAsync(
bool * const __restrict__ zcheck,
const T * const __restrict__ embedded_proportions,
const unsigned int filled_embs,
const uint64_t n_samples,
const uint64_t n_samples_r) {
#ifdef _OPENACC
#pragma acc parallel loop gang vector present(embedded_proportions,zcheck) async
#else
#pragma omp parallel for default(shared)
#endif
for(uint64_t k=0; k<n_samples; k++) {
bool all_zeros=true;

#pragma acc loop seq
for (uint64_t emb=0; emb<filled_embs; emb++) {
Review comment (Collaborator, Author):
This is the bug in the previous PR: it should have been filled_embs_els_round, as it is now on line 1181.

const uint64_t offset = n_samples_r * emb;

T u1 = embedded_proportions[offset + k];
all_zeros = all_zeros && (u1==0);
}

zcheck[k] = all_zeros;
}
}
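
The review comment above refers to the zero-check loop bound: for the unweighted path, embedded_proportions is bit-packed into uint64_t words, so a sample column has only (filled_embs+63)/64 elements and the loop must run over filled_embs_els_round. A minimal, hypothetical sketch of that point (not code from this PR; packed_rows and column_is_all_zero are illustrative names):

```cpp
#include <cstdint>

// filled_embs counts embeddings (one bit each); the packed layout stores
// 64 of them per uint64_t, so a sample column only has this many elements.
inline uint64_t packed_rows(unsigned int filled_embs) {
    return (filled_embs + 63) / 64;   // == filled_embs_els_round in the diff
}

// Zero check for one sample column k of a bit-packed matrix laid out as
// embedded_proportions[row * n_samples_r + sample].
inline bool column_is_all_zero(const uint64_t *embedded_proportions,
                               unsigned int filled_embs,
                               uint64_t n_samples_r,
                               uint64_t k) {
    bool all_zeros = true;
    // Looping up to filled_embs here would read past the packed rows;
    // filled_embs_els_round is the correct bound.
    for (uint64_t emb_el = 0; emb_el < packed_rows(filled_embs); emb_el++) {
        all_zeros = all_zeros && (embedded_proportions[emb_el * n_samples_r + k] == 0);
    }
    return all_zeros;
}
```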


// Single step in computing Weighted part of Unifrac
template<class TFloat>
static inline TFloat WeightedVal1(
@@ -1193,28 +1164,131 @@ void SUCMP_NM::UnifracVawGeneralizedTask<TFloat>::_run(unsigned int filled_embs,
#endif
}

// Single step in computing NormalizedWeighted Unifrac
// Single step in computing Unweighted Unifrac

template<class TFloat>
static inline void UnweightedOneSide(
bool * const __restrict__ zcheck,
TFloat * const __restrict__ stripe_sums,
const TFloat * const __restrict__ sums,
const uint64_t * const __restrict__ embedded_proportions,
const unsigned int filled_embs_els_round,
const uint64_t n_samples_r,
const uint64_t kl) {
bool all_zeros=true;
TFloat my_stripe = 0.0;

for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
const uint64_t offset = n_samples_r * emb_el;
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);

uint64_t o1 = embedded_proportions[offset + kl];

if (o1==0) { // zeros are prevalent
// nothing to do
} else {
all_zeros = false;
#ifndef _OPENACC
// CPU/SIMD faster if we check for partial compute
if (((uint32_t)o1)==0) {
// only high part relevant
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, xor is just the non-negative one

// Use the pre-computed sums
// Each range of 8 lengths has already been pre-computed and stored in psum
// Since embedded_proportions packed format is in 64-bit format for performance reasons
// we need to add the 8 sums using the four 8-bits for addressing inside psum

TFloat esum = psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe += esum;
} else if ((o1>>32)==0) {
// only low part relevant
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, xor is just the non-negative one

// Use the pre-computed sums
// Each range of 8 lengths has already been pre-computed and stored in psum
// Since embedded_proportions packed format is in 64-bit format for performance reasons
// we need to add the 8 sums using the four 8-bits for addressing inside psum

TFloat esum = psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))];
my_stripe += esum;
} else {
#else
// GPU/SIMT is faster if we always do the full compute
{
#endif
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, xor is just the non-negative one

// Use the pre-computed sums
// Each range of 8 lengths has already been pre-computed and stored in psum
// Since embedded_proportions packed format is in 64-bit format for performance reasons
// we need to add the 8 sums using the eight 8-bit chunks for addressing inside psum

TFloat esum = psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))] +
psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe += esum;
}
}
}

zcheck[kl] = all_zeros;
stripe_sums[kl] = my_stripe;

}
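
UnweightedOneSide resolves each 64-bit presence word by splitting it into bytes and adding the matching precomputed entries of psum, eight embeddings per table access. A standalone sketch of that byte decomposition, assuming psum holds eight 256-entry slices as described in the comments (lookup_sum64 is an illustrative helper, not part of the PR):

```cpp
#include <cstdint>

// Sum the precomputed per-byte entries for one 64-bit presence word o1,
// mirroring the full (GPU) branch of UnweightedOneSide. psum is assumed to
// hold 8 slices of 256 entries (0x800 TFloats), where psum[b*0x100 + v] is
// the summed length of the bits set in byte value v at byte position b.
template<class TFloat>
inline TFloat lookup_sum64(const TFloat * __restrict__ psum, uint64_t o1) {
    TFloat esum = 0.0;
    for (unsigned int b = 0; b < 8; b++) {
        esum += psum[b * 0x100 + (uint8_t)(o1 >> (8 * b))];
    }
    return esum;
}
```

The CPU-only branches above additionally skip whichever 32-bit half of the word is zero before doing the lookups.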

#ifdef _OPENACC

template<class TFloat>
static inline void Unweighted1(
TFloat * const __restrict__ dm_stripes_buf,
TFloat * const __restrict__ dm_stripes_total_buf,
const bool * const __restrict__ zcheck,
const TFloat * const __restrict__ stripe_sums,
const TFloat * const __restrict__ sums,
const uint64_t * const __restrict__ embedded_proportions,
const unsigned int filled_embs_els_round,
const uint64_t idx,
const uint64_t n_samples_r,
const uint64_t k,
const uint64_t l1) {
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];
//TFloat *dm_stripe_total = dm_stripes_total[stripe];
const bool allzero_k = zcheck[k];
const bool allzero_l1 = zcheck[l1];

bool did_update = false;
TFloat my_stripe = 0.0;
TFloat my_stripe_total = 0.0;
if (allzero_k && allzero_l1) {
// nothing to do, would have to add 0
} else {
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];
//TFloat *dm_stripe_total = dm_stripes_total[stripe];

bool did_update = false;
TFloat my_stripe = 0.0;
TFloat my_stripe_total = 0.0;

if (allzero_k || allzero_l1) {
// with one side zero, | and ^ are no-ops
const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-zero one
my_stripe = stripe_sums[kl];
did_update = (my_stripe!=0.0);
my_stripe_total = my_stripe;
} else {
// we need both sides
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
const uint64_t offset = n_samples_r * emb_el;
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
@@ -1250,19 +1324,23 @@ static inline void Unweighted1(
psum[0x700+((uint8_t)(x1 >> 56))];
}
}
}

if (did_update) {
dm_stripe[k] += my_stripe;
dm_stripe_total[k] += my_stripe_total;
}
if (did_update) {
dm_stripe[k] += my_stripe;
dm_stripe_total[k] += my_stripe_total;
}
} // (allzero_k && allzero_l1)

}
#else

template<class TFloat>
static inline void Unweighted1(
TFloat * const __restrict__ dm_stripes_buf,
TFloat * const __restrict__ dm_stripes_total_buf,
const bool * const __restrict__ zcheck,
const TFloat * const __restrict__ stripe_sums,
const TFloat * const __restrict__ sums,
const uint64_t * const __restrict__ embedded_proportions,
const unsigned int filled_embs_els_round,
@@ -1287,75 +1365,10 @@ static inline void Unweighted1(

if (allzero_k || allzero_l1) {
// with one side zero, | and ^ are no-ops
const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-zero one
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
const uint64_t offset = n_samples_r * emb_el;
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);

//uint64_t u1 = embedded_proportions[offset + k];
//uint64_t v1 = embedded_proportions[offset + l1];
//uint64_t o1 = u1 | v1;
// With one of the two being 0, or is just the non-negative one
uint64_t o1 = embedded_proportions[offset + kl];

if (o1==0) { // zeros are prevalent
// nothing to do
} else if (((uint32_t)o1)==0) {
// only high part relevant
did_update=true;
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, xor is just the non-negative one

// Use the pre-computed sums
// Each range of 8 lengths has already been pre-computed and stored in psum
// Since embedded_proportions packed format is in 64-bit format for performance reasons
// we need to add the 8 sums using the four 8-bits for addressing inside psum

TFloat esum = psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe_total += esum;
my_stripe += esum;
} else if ((o1>>32)==0) {
// only low part relevant
did_update=true;
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, xor is just the non-negative one

// Use the pre-computed sums
// Each range of 8 lengths has already been pre-computed and stored in psum
// Since embedded_proportions packed format is in 64-bit format for performance reasons
// we need to add the 8 sums using the four 8-bits for addressing inside psum

TFloat esum = psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))];
my_stripe_total += esum;
my_stripe += esum;
} else {
did_update=true;
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, xor is just the non-negative one

// Use the pre-computed sums
// Each range of 8 lengths has already been pre-computed and stored in psum
// Since embedded_proportions packed format is in 64-bit format for performance reasons
// we need to add the 8 sums using the four 8-bits for addressing inside psum

TFloat esum = psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))] +
psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe_total += esum;
my_stripe += esum;
}
}
const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-zero one
my_stripe = stripe_sums[kl];
my_stripe_total = my_stripe;
did_update = (my_stripe!=0.0);
} else {
// we need both sides
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
@@ -1441,6 +1454,30 @@ static inline void Unweighted1(
}
#endif
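
In the two-sided branch of Unweighted1 (partially collapsed above), each packed row combines the two samples with | for the denominator and ^ for the numerator; when one side is all zero the two coincide, which is why the shortcut can reuse stripe_sums for both. A hedged sketch of that per-row step (byte_lookup_sum and unweighted_pair_row are assumed helper names, not PR code):

```cpp
#include <cstdint>

// Equivalent to the lookup_sum64 sketch above: add the eight per-byte
// precomputed entries of psum for one 64-bit word.
template<class TFloat>
inline TFloat byte_lookup_sum(const TFloat * __restrict__ psum, uint64_t w) {
    TFloat s = 0.0;
    for (unsigned int b = 0; b < 8; b++)
        s += psum[b * 0x100 + (uint8_t)(w >> (8 * b))];
    return s;
}

// For one packed 64-bit row: the union (|) of the two samples' presence bits
// feeds the denominator (my_stripe_total) and the symmetric difference (^)
// feeds the numerator (my_stripe).
template<class TFloat>
inline void unweighted_pair_row(const TFloat * __restrict__ psum,
                                uint64_t u1, uint64_t v1,
                                TFloat &my_stripe, TFloat &my_stripe_total,
                                bool &did_update) {
    const uint64_t o1 = u1 | v1;   // present in either sample
    const uint64_t x1 = u1 ^ v1;   // present in exactly one sample
    if (o1 != 0) {
        did_update = true;
        my_stripe       += byte_lookup_sum(psum, x1);
        my_stripe_total += byte_lookup_sum(psum, o1);
    }
}
```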

// check for zero values
template<class TFloat, class T>
static inline void UnweightedZerosAndSums(
bool * const __restrict__ zcheck,
TFloat * const __restrict__ stripe_sums,
const TFloat * const __restrict__ el_sums,
const T * const __restrict__ embedded_proportions,
const unsigned int filled_embs_els_round,
const uint64_t n_samples,
const uint64_t n_samples_r) {
#ifdef _OPENACC
#pragma acc parallel loop gang vector present(embedded_proportions,zcheck,el_sums,stripe_sums)
#else
#pragma omp parallel for default(shared)
#endif
for(uint64_t k=0; k<n_samples; k++) {
UnweightedOneSide(
zcheck, stripe_sums,
el_sums, embedded_proportions,
filled_embs_els_round, n_samples_r,
k);
}
}
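
The _run method below fills sums before the main loops; the hunks that build the table are partially collapsed in this diff, so the following is only an assumed reconstruction of that precomputation. For every packed 64-bit row and every byte position, the summed branch length of each possible byte value is stored, which is what makes the single-byte lookups above valid (build_psum is an illustrative name; the bit-to-embedding mapping is an assumption):

```cpp
#include <cstdint>

// For each packed row emb_el and byte position b, precompute, for all 256
// byte values, the sum of lengths of the embeddings whose bits are set in
// that byte. psum lookups then resolve 8 embeddings per table access.
template<class TFloat>
void build_psum(TFloat * __restrict__ sums,           // 0x800 entries per packed row
                const TFloat * __restrict__ lengths,  // one length per embedding
                unsigned int filled_embs,
                unsigned int filled_embs_els_round) {
    for (unsigned int emb_el = 0; emb_el < filled_embs_els_round; emb_el++) {
        TFloat * __restrict__ psum = sums + uint64_t(emb_el) * 0x800;
        for (unsigned int b = 0; b < 8; b++) {           // byte position in the word
            for (unsigned int v = 0; v < 0x100; v++) {   // every possible byte value
                TFloat s = 0.0;
                for (unsigned int bit = 0; bit < 8; bit++) {
                    const unsigned int emb = emb_el * 64 + b * 8 + bit;
                    if ((v & (1u << bit)) && (emb < filled_embs)) s += lengths[emb];
                }
                psum[b * 0x100 + v] = s;
            }
        }
    }
}
```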

template<class TFloat>
void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
const uint64_t start_idx = this->task_p->start;
@@ -1454,6 +1491,8 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
TFloat * const __restrict__ dm_stripes_total_buf = this->dm_stripes_total.buf;

TFloat * const __restrict__ sums = this->sums;
bool * const __restrict__ zcheck = this->zcheck;
TFloat * const __restrict__ stripe_sums = this->stripe_sums;

const uint64_t step_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::step_size;
const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
@@ -1463,15 +1502,6 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con

const uint64_t filled_embs_els_round = (filled_embs+63)/64;

#ifndef _OPENACC
// not effective for GPUs, but helps a lot on the CPUs
bool * const __restrict__ zcheck = this->zcheck;
// check for zero values
WeightedZerosAsync(zcheck,
embedded_proportions,
filled_embs_els_round, n_samples, n_samples_r);
#endif

// pre-compute sums of length elements, since they are likely to be accessed many times
// We will use a 8-bit map, to keep it small enough to keep in L1 cache
#ifdef _OPENACC
@@ -1527,9 +1557,14 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
}
}

#pragma acc wait
// check for zero values and compute stripe sums
UnweightedZerosAndSums(zcheck, stripe_sums,
sums, embedded_proportions,
filled_embs_els_round, n_samples, n_samples_r);

// point of thread
#ifdef _OPENACC
#pragma acc wait
const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
#pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
#else
@@ -1548,9 +1583,7 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con

Unweighted1<TFloat>(
dm_stripes_buf,dm_stripes_total_buf,
#ifndef _OPENACC
zcheck,
#endif
zcheck, stripe_sums,
sums, embedded_proportions,
filled_embs_els_round,idx, n_samples_r,
k, l1);