Skip to content

Commit

Permalink
Only do a htod memcpy in getParticleTileData when necessary (AMReX-Co…
Browse files Browse the repository at this point in the history
…des#3760)

## Summary

Previously an htod memcpy was done every time `getParticleTileData()` or
`getConstParticleTileData()` was called if runtime components were
allocated. Now it will only be done if the runtime component pointers
have changed, for example after a resize or after ReorderParticles.

## Additional background

## Checklist

The proposed changes:
- [ ] fix a bug or incorrect behavior in AMReX
- [ ] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX
users
- [ ] include documentation in the code and/or rst files, if appropriate
  • Loading branch information
AlexanderSinn authored Feb 15, 2024
1 parent 8b476a9 commit f692e78
Showing 1 changed file with 64 additions and 46 deletions.
110 changes: 64 additions & 46 deletions Src/Particle/AMReX_ParticleTile.H
Original file line number Diff line number Diff line change
Expand Up @@ -1109,35 +1109,41 @@ struct ParticleTile

ParticleTileDataType getParticleTileData ()
{
int index = NArrayReal;
m_runtime_r_ptrs.resize(m_soa_tile.NumRealComps() - NArrayReal);
m_runtime_i_ptrs.resize(m_soa_tile.NumIntComps() - NArrayInt);
#ifdef AMREX_USE_GPU
Gpu::HostVector<ParticleReal*> h_runtime_r_ptrs(m_runtime_r_ptrs.size());
for (auto& r_ptr : h_runtime_r_ptrs) {
r_ptr = m_soa_tile.GetRealData(index++).dataPtr();
}
if (h_runtime_r_ptrs.size() > 0) {
Gpu::htod_memcpy_async(m_runtime_r_ptrs.data(), h_runtime_r_ptrs.data(),
h_runtime_r_ptrs.size()*sizeof(ParticleReal*));
bool copy_real = false;
m_h_runtime_r_ptrs.resize(m_soa_tile.NumRealComps() - NArrayReal);
for (std::size_t i = 0; i < m_h_runtime_r_ptrs.size(); ++i) {
if (m_h_runtime_r_ptrs[i] != m_soa_tile.GetRealData(i + NArrayReal).dataPtr()) {
m_h_runtime_r_ptrs[i] = m_soa_tile.GetRealData(i + NArrayReal).dataPtr();
copy_real = true;
}
}
#else
for (auto& r_ptr : m_runtime_r_ptrs) {
r_ptr = m_soa_tile.GetRealData(index++).dataPtr();
if (copy_real) {
Gpu::htod_memcpy_async(m_runtime_r_ptrs.data(), m_h_runtime_r_ptrs.data(),
m_h_runtime_r_ptrs.size()*sizeof(ParticleReal*));
}
#endif

index = NArrayInt;
#ifdef AMREX_USE_GPU
Gpu::HostVector<int*> h_runtime_i_ptrs(m_runtime_i_ptrs.size());
for (auto& i_ptr : h_runtime_i_ptrs) {
i_ptr = m_soa_tile.GetIntData(index++).dataPtr();
bool copy_int = false;
m_h_runtime_i_ptrs.resize(m_soa_tile.NumIntComps() - NArrayInt);
for (std::size_t i = 0; i < m_h_runtime_i_ptrs.size(); ++i) {
if (m_h_runtime_i_ptrs[i] != m_soa_tile.GetIntData(i + NArrayInt).dataPtr()) {
m_h_runtime_i_ptrs[i] = m_soa_tile.GetIntData(i + NArrayInt).dataPtr();
copy_int = true;
}
}
if (h_runtime_i_ptrs.size() > 0) {
Gpu::htod_memcpy_async(m_runtime_i_ptrs.data(), h_runtime_i_ptrs.data(),
h_runtime_i_ptrs.size()*sizeof(int*));
if (copy_int) {
Gpu::htod_memcpy_async(m_runtime_i_ptrs.data(), m_h_runtime_i_ptrs.data(),
m_h_runtime_i_ptrs.size()*sizeof(int*));
}
#else
for (auto& i_ptr : m_runtime_i_ptrs) {
i_ptr = m_soa_tile.GetIntData(index++).dataPtr();
for (std::size_t i = 0; i < m_runtime_r_ptrs.size(); ++i) {
m_runtime_r_ptrs[i] = m_soa_tile.GetRealData(i + NArrayReal).dataPtr();
}

for (std::size_t i = 0; i < m_runtime_i_ptrs.size(); ++i) {
m_runtime_i_ptrs[i] = m_soa_tile.GetIntData(i + NArrayInt).dataPtr();
}
#endif

Expand Down Expand Up @@ -1169,7 +1175,7 @@ struct ParticleTile
ptd.m_runtime_idata = m_runtime_i_ptrs.dataPtr();

#ifdef AMREX_USE_GPU
if ((h_runtime_r_ptrs.size() > 0) || (h_runtime_i_ptrs.size() > 0)) {
if (copy_real || copy_int) {
Gpu::streamSynchronize();
}
#endif
Expand All @@ -1179,35 +1185,41 @@ struct ParticleTile

ConstParticleTileDataType getConstParticleTileData () const
{
int index = NArrayReal;
m_runtime_r_cptrs.resize(m_soa_tile.NumRealComps() - NArrayReal);
m_runtime_i_cptrs.resize(m_soa_tile.NumIntComps() - NArrayInt);
#ifdef AMREX_USE_GPU
Gpu::HostVector<ParticleReal const*> h_runtime_r_cptrs(m_runtime_r_cptrs.size());
for (auto& r_ptr : h_runtime_r_cptrs) {
r_ptr = m_soa_tile.GetRealData(index++).dataPtr();
}
if (h_runtime_r_cptrs.size() > 0) {
Gpu::htod_memcpy_async(m_runtime_r_cptrs.data(), h_runtime_r_cptrs.data(),
h_runtime_r_cptrs.size()*sizeof(ParticleReal const*));
bool copy_real = false;
m_h_runtime_r_cptrs.resize(m_soa_tile.NumRealComps() - NArrayReal);
for (std::size_t i = 0; i < m_h_runtime_r_cptrs.size(); ++i) {
if (m_h_runtime_r_cptrs[i] != m_soa_tile.GetRealData(i + NArrayReal).dataPtr()) {
m_h_runtime_r_cptrs[i] = m_soa_tile.GetRealData(i + NArrayReal).dataPtr();
copy_real = true;
}
}
#else
for (auto& r_ptr : m_runtime_r_cptrs) {
r_ptr = m_soa_tile.GetRealData(index++).dataPtr();
if (copy_real) {
Gpu::htod_memcpy_async(m_runtime_r_cptrs.data(), m_h_runtime_r_cptrs.data(),
m_h_runtime_r_cptrs.size()*sizeof(ParticleReal*));
}
#endif

index = NArrayInt;
#ifdef AMREX_USE_GPU
Gpu::HostVector<int const*> h_runtime_i_cptrs(m_runtime_i_cptrs.size());
for (auto& i_ptr : h_runtime_i_cptrs) {
i_ptr = m_soa_tile.GetIntData(index++).dataPtr();
bool copy_int = false;
m_h_runtime_i_cptrs.resize(m_soa_tile.NumIntComps() - NArrayInt);
for (std::size_t i = 0; i < m_h_runtime_i_cptrs.size(); ++i) {
if (m_h_runtime_i_cptrs[i] != m_soa_tile.GetIntData(i + NArrayInt).dataPtr()) {
m_h_runtime_i_cptrs[i] = m_soa_tile.GetIntData(i + NArrayInt).dataPtr();
copy_int = true;
}
}
if (h_runtime_i_cptrs.size() > 0) {
Gpu::htod_memcpy_async(m_runtime_i_cptrs.data(), h_runtime_i_cptrs.data(),
h_runtime_i_cptrs.size()*sizeof(int const*));
if (copy_int) {
Gpu::htod_memcpy_async(m_runtime_i_cptrs.data(), m_h_runtime_i_cptrs.data(),
m_h_runtime_i_cptrs.size()*sizeof(int*));
}
#else
for (auto& i_ptr : m_runtime_i_cptrs) {
i_ptr = m_soa_tile.GetIntData(index++).dataPtr();
for (std::size_t i = 0; i < m_runtime_r_cptrs.size(); ++i) {
m_runtime_r_cptrs[i] = m_soa_tile.GetRealData(i + NArrayReal).dataPtr();
}

for (std::size_t i = 0; i < m_runtime_i_cptrs.size(); ++i) {
m_runtime_i_cptrs[i] = m_soa_tile.GetIntData(i + NArrayInt).dataPtr();
}
#endif

Expand Down Expand Up @@ -1239,7 +1251,7 @@ struct ParticleTile
ptd.m_runtime_idata = m_runtime_i_cptrs.dataPtr();

#ifdef AMREX_USE_GPU
if ((h_runtime_r_cptrs.size() > 0) || (h_runtime_i_cptrs.size() > 0)) {
if (copy_real || copy_int) {
Gpu::streamSynchronize();
}
#endif
Expand All @@ -1259,6 +1271,12 @@ private:

mutable amrex::PODVector<const ParticleReal*, Allocator<const ParticleReal*> > m_runtime_r_cptrs;
mutable amrex::PODVector<const int*, Allocator<const int*> >m_runtime_i_cptrs;

amrex::Gpu::HostVector<ParticleReal*> m_h_runtime_r_ptrs;
amrex::Gpu::HostVector<int*> m_h_runtime_i_ptrs;

mutable amrex::Gpu::HostVector<const ParticleReal*> m_h_runtime_r_cptrs;
mutable amrex::Gpu::HostVector<const int*> m_h_runtime_i_cptrs;
};

} // namespace amrex
Expand Down

0 comments on commit f692e78

Please sign in to comment.