Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimization of the construction of SFC #1227

Merged
merged 1 commit into from
Aug 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
267 changes: 159 additions & 108 deletions Src/Base/AMReX_DistributionMapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -888,51 +888,157 @@ namespace
bool operator () (const SFCToken& lhs,
const SFCToken& rhs) const;
};

SFCToken (int box, const IntVect& idx, Real vol)
:
m_box(box), m_idx(idx), m_vol(vol) {}

int m_box;
IntVect m_idx;
Real m_vol;

static int MaxPower;
int m_box;
Array<uint32_t,AMREX_SPACEDIM> m_morton;
};
}

int SFCToken::MaxPower = 64;

AMREX_FORCE_INLINE
bool
SFCToken::Compare::operator () (const SFCToken& lhs,
const SFCToken& rhs) const
{
for (int i = SFCToken::MaxPower - 1; i >= 0; --i)
#if (AMREX_SPACEDIM == 1)
return lhs.m_morton[0] < rhs.m_morton[0];
#elif (AMREX_SPACEDIM == 2)
return (lhs.m_morton[1] < rhs.m_morton[1]) ||
((lhs.m_morton[1] == rhs.m_morton[1]) &&
(lhs.m_morton[0] < rhs.m_morton[0]));
#else
return (lhs.m_morton[2] < rhs.m_morton[2]) ||
((lhs.m_morton[2] == rhs.m_morton[2]) &&
((lhs.m_morton[1] < rhs.m_morton[1]) ||
((lhs.m_morton[1] == rhs.m_morton[1]) &&
(lhs.m_morton[0] < rhs.m_morton[0]))));
#endif
}

namespace {
#if (AMREX_SPACEDIM == 3)
AMREX_FORCE_INLINE
uint32_t make_space (uint32_t x)
{
const int N = (1<<i);
// x : 0000,0000,0000,0000,0000,00a9,8765,4321
x = (x | (x << 16)) & 0x030000FF;
// x << 16 : 0000,00a9,8765,4321,0000,0000,0000,0000
// x | (x << 16): 0000,00a9,8765,4321,0000,00a9,8765,4321
// 0x030000FF : 0000,0011,0000,0000,0000,0000,1111,1111
// x : 0000,00a9,0000,0000,0000,0000,8765,4321
x = (x | (x << 8)) & 0x0300F00F;
// x << 8 : 0000,0000,0000,0000,8765,4321,0000,0000
// x | (x << 8) : 0000,00a9,0000,0000,8765,4321,8765,4321
// 0x0300F00F : 0000,0011,0000,0000,1111,0000,0000,1111
// x : 0000,00a9,0000,0000,8765,0000,0000,4321
x = (x | (x << 4)) & 0x030C30C3;
// x << 4 : 00a9,0000,0000,8765,0000,0000,4321,0000
// x | (x << 4) : 00a9,00a9,0000,8765,8765,0000,4321,4321
// 0x030C30C3 : 0000,0011,0000,1100,0011,0000,1100,0011
// x : 0000,00a9,0000,8700,0065,0000,4300,0021
x = (x | (x << 2)) & 0x09249249;
// x << 2 : 0000,a900,0087,0000,6500,0043,0000,2100
// x | (x << 2) : 0000,a9a9,0087,8700,6565,0043,4300,2121
// 0x09249249 : 0000,1001,0010,0100,1001,0010,0100,1001
// x : 0000,a009,0080,0700,6005,0040,0300,2001
return x;
}
#elif (AMREX_SPACEDIM == 2)
AMREX_FORCE_INLINE
uint32_t make_space (uint32_t x)
{
// x : 0000,0000,0000,0000,gfed,cba9,8765,4321
x = (x | (x << 8)) & 0x00FF00FF;
// x << 8 : 0000,0000,gfed,cba9,8765,4321,0000,0000
// x | (x << 8): 0000,0000,gfed,cba9,????,????,8765,4321
// 0x00FF00FF : 0000,0000,1111,1111,0000,0000,1111,1111
// x : 0000,0000,gfed,cba9,0000,0000,8765,4321
x = (x | (x << 4)) & 0x0F0F0F0F;
// x << 4 : 0000,gfed,cba9,0000,0000,8765,4321,0000
// x | (x << 4): 0000,gfed,????,cba9,0000,8765,????,4321
// 0x0F0F0F0F : 0000,1111,0000,1111,0000,1111,0000,1111
// x : 0000,gfed,0000,cba9,0000,8765,0000,4321
x = (x | (x << 2)) & 0x33333333;
// x << 2 : 00gf,ed00,00cb,a900,0087,6500,0043,2100
// x | (x << 2): 00gf,??ed,00cb,??a9,0087,??65,0043,??21
// 0x33333333 : 0011,0011,0011,0011,0011,0011,0011,0011
// x : 00gf,00ed,00cb,00a9,0087,0065,0043,0021
x = (x | (x << 1)) & 0x55555555;
// x << 1 : 0gf0,0ed0,0cb0,0a90,0870,0650,0430,0210
// x | (x << 1): 0g?f,0e?d,0c?b,0a?9,08?7,06?5,04?3,02?1
// 0x55555555 : 0101,0101,0101,0101,0101,0101,0101,0101
// x : 0g0f,0e0d,0c0b,0a09,0807,0605,0403,0201
return x;
}
#endif

for (int j = AMREX_SPACEDIM-1; j >= 0; --j)
{
const int il = lhs.m_idx[j]/N;
const int ir = rhs.m_idx[j]/N;
AMREX_FORCE_INLINE
SFCToken makeSFCToken (int box_index, IntVect const& iv)
{
SFCToken token;
token.m_box = box_index;

#if (AMREX_SPACEDIM == 3)

constexpr int imin = -static_cast<int>(1 << 29);
AMREX_ASSERT_WITH_MESSAGE(AMREX_D_TERM(iv[0] >= imin && iv[0] < -imin,
&& iv[1] >= imin && iv[1] < -imin,
&& iv[2] >= imin && iv[2] < -imin),
"SFCToken: index out of range");
uint32_t x = iv[0] - imin;
uint32_t y = iv[1] - imin;
uint32_t z = iv[2] - imin;
// extract lowest 10 bits and make space for interleaving
token.m_morton[0] = make_space(x & 0x3FF)
| (make_space(y & 0x3FF) << 1)
| (make_space(z & 0x3FF) << 2);
x = x >> 10;
y = y >> 10;
z = z >> 10;
token.m_morton[1] = make_space(x & 0x3FF)
| (make_space(y & 0x3FF) << 1)
| (make_space(z & 0x3FF) << 2);
x = x >> 10;
y = y >> 10;
z = z >> 10;
token.m_morton[2] = make_space(x & 0x3FF)
| (make_space(y & 0x3FF) << 1)
| (make_space(z & 0x3FF) << 2);

#elif (AMREX_SPACEDIM == 2)

constexpr uint32_t offset = 1 << 31;
static_assert(static_cast<uint32_t>(std::numeric_limits<int>::max())+1 == offset,
"INT_MAX != (1<<31)-1");
uint32_t x = (iv[0] >= 0) ? static_cast<uint32_t>(iv[0]) + offset
: static_cast<uint32_t>(iv[0]-std::numeric_limits<int>::lowest());
uint32_t y = (iv[1] >= 0) ? static_cast<uint32_t>(iv[1]) + offset
: static_cast<uint32_t>(iv[1]-std::numeric_limits<int>::lowest());
// extract lowest 16 bits and make sapce for interleaving
token.m_morton[0] = make_space(x & 0xFFFF)
| (make_space(y & 0xFFFF) << 1);
x = x >> 16;
y = y >> 16;
token.m_morton[1] = make_space(x) | (make_space(y) << 1);

#elif (AMREX_SPACEDIM == 1)

constexpr uint32_t offset = 1 << 31;
static_assert(static_cast<uint32_t>(std::numeric_limits<int>::max())+1 == offset,
"INT_MAX != (1<<31)-1");
token.m_morton[0] = (iv[0] >= 0) ? static_cast<uint32_t>(iv[0]) + offset
: static_cast<uint32_t>(iv[0]-std::numeric_limits<int>::lowest());

if (il < ir)
{
return true;
}
else if (il > ir)
{
return false;
}
}
#else
static_assert(false,"AMREX_SPACEDIM != 1, 2 or 3");
#endif

return token;
}
return false;
}

static
void
Distribute (const std::vector<SFCToken>& tokens,
const std::vector<Long>& wgts,
int nprocs,
Real volpercpu,
std::vector< std::vector<int> >& v)
Expand All @@ -948,8 +1054,7 @@ Distribute (const std::vector<SFCToken>& tokens,
for (const auto &t : tokens) {
Print() << " " << idx++ << ": "
<< t.m_box << ": "
<< t.m_idx << ": "
<< t.m_vol << std::endl;
<< t.m_morton << std::endl;
}
}

Expand All @@ -967,7 +1072,7 @@ Distribute (const std::vector<SFCToken>& tokens,
K < TSZ && (i == (nprocs-1) || (vol < volpercpu));
++K)
{
vol += tokens[K].m_vol;
vol += wgts[tokens[K].m_box];
++cnt;

v[i].push_back(tokens[K].m_box);
Expand All @@ -981,7 +1086,7 @@ Distribute (const std::vector<SFCToken>& tokens,
{
--K;
v[i].pop_back();
totalvol -= tokens[K].m_vol;
totalvol -= wgts[tokens[K].m_box];
}
}

Expand All @@ -997,9 +1102,8 @@ Distribute (const std::vector<SFCToken>& tokens,
BL_ASSERT(box == t.m_box);
Print() << " " << idx << ": "
<< t.m_box << ": "
<< t.m_idx << ": "
<< t.m_vol << std::endl;
rank_vol += t.m_vol;
<< t.m_morton << std::endl;
rank_vol += wgts[t.m_box];
idx++;
}
Print() << " Total Rank Vol: " << rank_vol << std::endl;
Expand Down Expand Up @@ -1051,49 +1155,30 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes,
<< nprocs << ", " << nteams << ", " << nworkers << ")\n";
}

std::vector<SFCToken> tokens;

const int N = boxes.size();

std::vector<SFCToken> tokens;
tokens.reserve(N);

int maxijk = 0;

for (int i = 0; i < N; ++i)
{
const Box& bx = boxes[i];
tokens.push_back(SFCToken(i,bx.smallEnd(),wgts[i]));

const SFCToken& token = tokens.back();

AMREX_D_TERM(maxijk = std::max(maxijk, token.m_idx[0]);,
maxijk = std::max(maxijk, token.m_idx[1]);,
maxijk = std::max(maxijk, token.m_idx[2]););
const Box& bx = boxes[i];
tokens.push_back(makeSFCToken(i, bx.smallEnd()));
}
//
// Set SFCToken::MaxPower for BoxArray.
//
int m = 0;
for ( ; (1 << m) <= maxijk; ++m) {
; // do nothing
}
SFCToken::MaxPower = m;
//
// Put'm in Morton space filling curve order.
//
std::sort(tokens.begin(), tokens.end(), SFCToken::Compare());
//
// Split'm up as equitably as possible per team.
//
Real volperteam = 0;
for (const SFCToken& tok : tokens) {
volperteam += tok.m_vol;
for (Long wt : wgts) {
volperteam += wt;
}
volperteam /= nteams;

std::vector< std::vector<int> > vec(nteams);

Distribute(tokens,nteams,volperteam,vec);
Distribute(tokens,wgts,nteams,volperteam,vec);

// vec has a size of nteams and vec[] holds a vector of box ids.

Expand Down Expand Up @@ -1315,33 +1400,14 @@ DistributionMapping::RRSFCDoIt (const BoxArray& boxes,
amrex::Abort("Team support is not implemented yet in RRSFC");
#endif

std::vector<SFCToken> tokens;

const int nboxes = boxes.size();

std::vector<SFCToken> tokens;
tokens.reserve(nboxes);

int maxijk = 0;

for (int i = 0; i < nboxes; ++i)
{
const Box& bx = boxes[i];
tokens.push_back(SFCToken(i,bx.smallEnd(),0.0));

const SFCToken& token = tokens.back();

AMREX_D_TERM(maxijk = std::max(maxijk, token.m_idx[0]);,
maxijk = std::max(maxijk, token.m_idx[1]);,
maxijk = std::max(maxijk, token.m_idx[2]););
}
//
// Set SFCToken::MaxPower for BoxArray.
//
int m = 0;
for ( ; (1 << m) <= maxijk; ++m) {
; // do nothing
const Box& bx = boxes[i];
tokens.push_back(makeSFCToken(i, bx.smallEnd()));
}
SFCToken::MaxPower = m;
//
// Put'm in Morton space filling curve order.
//
Expand Down Expand Up @@ -1731,37 +1797,21 @@ DistributionMapping::makeSFC (const BoxArray& ba, bool use_box_vol, const int np
{
BL_PROFILE("makeSFC");

std::vector<SFCToken> tokens;

const int N = ba.size();

std::vector<SFCToken> tokens;
std::vector<Long> wgts;
tokens.reserve(N);

int maxijk = 0;

Real vol_sum = 0;
wgts.reserve(N);
Long vol_sum = 0;
for (int i = 0; i < N; ++i)
{
const Box& bx = ba[i];
const auto & bx_vol = (use_box_vol ? bx.volume() : 1);
tokens.push_back(SFCToken(i,bx.smallEnd(),bx_vol));
vol_sum += bx_vol;

const SFCToken& token = tokens.back();

AMREX_D_TERM(maxijk = std::max(maxijk, token.m_idx[0]);,
maxijk = std::max(maxijk, token.m_idx[1]);,
maxijk = std::max(maxijk, token.m_idx[2]););
const Box& bx = ba[i];
tokens.push_back(makeSFCToken(i, bx.smallEnd()));
const Long v = use_box_vol ? bx.volume() : Long(1);
vol_sum += v;
wgts.push_back(v);
}
//
// Set SFCToken::MaxPower for BoxArray.
//
int m = 0;
for ( ; (1 << m) <= maxijk; ++m) {
; // do nothing
}
SFCToken::MaxPower = m;
//
// Put'm in Morton space filling curve order.
//
std::sort(tokens.begin(), tokens.end(), SFCToken::Compare());
Expand All @@ -1770,7 +1820,8 @@ DistributionMapping::makeSFC (const BoxArray& ba, bool use_box_vol, const int np
volper = vol_sum / nprocs;

std::vector< std::vector<int> > r(nprocs);
Distribute(tokens, nprocs, volper, r);

Distribute(tokens, wgts, nprocs, volper, r);

return r;
}
Expand Down
9 changes: 9 additions & 0 deletions Src/Base/AMReX_Print.H
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@

namespace amrex
{
template <typename T>
std::ostream& operator<< (std::ostream& os, Array<T,AMREX_SPACEDIM> const& a)
{
os << AMREX_D_TERM( '(' << a[0] , <<
',' << a[1] , <<
',' << a[2]) << ')';
return os;
}

template <typename T, typename S>
std::ostream& operator<<(std::ostream& os, const std::pair<T, S>& v)
{
Expand Down