diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H index 37516ac3ada..ea2c4e526e0 100644 --- a/Src/Particle/AMReX_ParticleContainerI.H +++ b/Src/Particle/AMReX_ParticleContainerI.H @@ -384,30 +384,56 @@ template Vector ParticleContainer::NumberOfParticlesInGrid (int lev, bool only_valid, bool only_local) const { - auto ngrids = ParticleBoxArray(lev).size(); - Vector nparticles(ngrids, 0); + AMREX_ASSERT(lev >= 0 && lev < int(m_particles.size())); - if (lev >= 0 && lev < int(m_particles.size())) { - for (const auto& kv : GetParticles(lev)) { - int gid = kv.first.first; - const auto& ptile = kv.second; + LayoutData np_per_grid_local(ParticleBoxArray(lev), + ParticleDistributionMap(lev)); - if (only_valid) { - for (int k = 0; k < ptile.GetArrayOfStructs().numParticles(); ++k) { - const ParticleType& p = ptile.GetArrayOfStructs()[k]; - if (p.id() > 0) ++nparticles[gid]; - } - } else { - nparticles[gid] += ptile.numParticles(); - } + for (ParConstIterType pti(*this, lev); pti.isValid(); ++pti) + { + int gid = pti.index(); + if (only_valid) + { + const auto& ptile = ParticlesAt(lev, pti); + const auto& aos = ptile.GetArrayOfStructs(); + const auto pstruct = aos().dataPtr(); + const int np = ptile.numParticles(); + + ReduceOps reduce_op; + ReduceData reduce_data(reduce_op); + using ReduceTuple = typename decltype(reduce_data)::Type; + + reduce_op.eval(np, reduce_data, + [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple + { + return (pstruct[i].id() > 0) ? 1 : 0; + }); + + int np_valid = amrex::get<0>(reduce_data.value()); + np_per_grid_local[gid] += np_valid; + } else + { + np_per_grid_local[gid] += pti.numParticles(); + } } - - if (!only_local) { - ParallelAllReduce::Sum(&nparticles[0], ngrids, ParallelContext::CommunicatorSub()); + + Vector nparticles(np_per_grid_local.size(), 0); + if (only_local) + { + for (ParConstIterType pti(*this, lev); pti.isValid(); ++pti) + { + nparticles[pti.index()] = np_per_grid_local[pti.index()]; + } + } + else + { + ParallelDescriptor::GatherLayoutDataToVector(np_per_grid_local, nparticles, + ParallelContext::IOProcessorNumberSub()); + ParallelDescriptor::Bcast(&nparticles[0], nparticles.size(), + ParallelContext::IOProcessorNumberSub()); } - } - return nparticles; + return nparticles; } template