diff --git a/Src/Base/AMReX_GpuAllocators.H b/Src/Base/AMReX_GpuAllocators.H
index 6e8fe639459..6f9377c1857 100644
--- a/Src/Base/AMReX_GpuAllocators.H
+++ b/Src/Base/AMReX_GpuAllocators.H
@@ -166,7 +166,6 @@ namespace amrex {
 
     };
 
-
 #ifdef AMREX_USE_GPU
     template <class T>
     using DefaultAllocator = amrex::ArenaAllocator<T>;
diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H
index a7ed5e9c06d..644d379f705 100644
--- a/Src/Particle/AMReX_ParticleContainerI.H
+++ b/Src/Particle/AMReX_ParticleContainerI.H
@@ -32,7 +32,6 @@ template <int NStructReal, int NStructInt, int NArrayReal, int NArrayInt>
 void
 ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt> :: Initialize ()
 {
-    levelDirectoriesCreated = false;
     usePrePost = false;
     doUnlink = true;
 
@@ -43,7 +42,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt> :: Initialize
     {
         static_assert(sizeof(ParticleType)%sizeof(RealType) == 0,
                       "sizeof ParticleType is not a multiple of sizeof RealType");
-        
+
         ParmParse pp("particles");
         pp.query("do_tiling", do_tiling);
         Vector<int> tilesize(AMREX_SPACEDIM);
@@ -54,7 +53,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt> :: Initialize
         static_assert(std::is_standard_layout<ParticleType>::value
                    && std::is_trivial<ParticleType>::value,
                       "Particle type must be standard layout and trivial.");
-        
+
         pp.query("use_prepost", usePrePost);
         pp.query("do_unlink", doUnlink);
 
@@ -909,7 +908,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>::
 copyParticles (const ParticleContainerType& other, bool local)
 {
     BL_PROFILE("ParticleContainer::copyParticles");
-    clearParticles();   
+    clearParticles();
     addParticles(other, local);
 }
 
@@ -926,7 +925,7 @@ addParticles (const ParticleContainerType& other, bool local)
         auto& plevel = GetParticles(lev);
         for(MFIter mfi = other.MakeMFIter(lev); mfi.isValid(); ++mfi)
         {
-            auto index = std::make_pair(mfi.index(), mfi.LocalTileIndex());            
+            auto index = std::make_pair(mfi.index(), mfi.LocalTileIndex());
             if(plevel_other.find(index) == plevel_other.end()) continue;
 
             auto& ptile = plevel[index];
@@ -937,10 +936,10 @@ addParticles (const ParticleContainerType& other, bool local)
             auto dst_index = ptile.numParticles();
             ptile.resize(dst_index + np);
 
-            amrex::copyParticles(ptile, ptile_other, 0, dst_index, np);            
+            amrex::copyParticles(ptile, ptile_other, 0, dst_index, np);
         }
     }
-    
+
     if (not local) Redistribute();
 }
 
diff --git a/Src/Particle/AMReX_ParticleHDF5.H b/Src/Particle/AMReX_ParticleHDF5.H
index 528f4fc35a6..359b0c34fb0 100644
--- a/Src/Particle/AMReX_ParticleHDF5.H
+++ b/Src/Particle/AMReX_ParticleHDF5.H
@@ -296,35 +296,31 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 
     std::string pdir = dir;
     if ( not pdir.empty() and pdir[pdir.size()-1] != '/') pdir += '/';
-    
-    if ( ! levelDirectoriesCreated)
-    {
-        if (ParallelDescriptor::IOProcessor()) {
-            if ( ! amrex::UtilCreateDirectory(pdir, 0755)) 
-                amrex::CreateDirectoryFailed(pdir);
-
-            int set_stripe = 0;
-            char setstripe[1024];
-            int stripe_count = 128;
-            int stripe_size = 1;
-            char *stripe_count_str = getenv("HDF5_STRIPE_COUNT");
-            char *stripe_size_str  = getenv("HDF5_STRIPE_SIZE");
-            if (stripe_count_str) { 
-                stripe_count = atoi(stripe_count_str);
-                set_stripe = 1;
-            }
-            if (stripe_size_str) {
-                stripe_size = atoi(stripe_size_str);
-                set_stripe = 1;
-            }
-            if (set_stripe == 1) {
-                sprintf(setstripe, "lfs setstripe -c %d -S %dm %s", stripe_count, stripe_size, pdir.c_str());
-                std::cout << "Setting stripe parameters for HDF5 output: " << setstripe << std::endl; 
-                amrex::ignore_unused(std::system(setstripe));
-            }
+    if (ParallelDescriptor::IOProcessor()) {
+        if ( ! amrex::UtilCreateDirectory(pdir, 0755)) 
+            amrex::CreateDirectoryFailed(pdir);
+
+        int set_stripe = 0;
+        char setstripe[1024];
+        int stripe_count = 128;
+        int stripe_size = 1;
+        char *stripe_count_str = getenv("HDF5_STRIPE_COUNT");
+        char *stripe_size_str  = getenv("HDF5_STRIPE_SIZE");
+        if (stripe_count_str) { 
+            stripe_count = atoi(stripe_count_str);
+            set_stripe = 1;
+        }
+        if (stripe_size_str) {
+            stripe_size = atoi(stripe_size_str);
+            set_stripe = 1;
+        }
+        if (set_stripe == 1) {
+            sprintf(setstripe, "lfs setstripe -c %d -S %dm %s", stripe_count, stripe_size, pdir.c_str());
+            std::cout << "Setting stripe parameters for HDF5 output: " << setstripe << std::endl; 
+            amrex::ignore_unused(std::system(setstripe));
         }
-        ParallelDescriptor::Barrier();
     }
+    ParallelDescriptor::Barrier();
 
     Long nparticles = 0;
     int maxnextid;
diff --git a/Src/Particle/AMReX_ParticleIO.H b/Src/Particle/AMReX_ParticleIO.H
index e26e5f9cd92..c73d6edbbbb 100644
--- a/Src/Particle/AMReX_ParticleIO.H
+++ b/Src/Particle/AMReX_ParticleIO.H
@@ -1,6 +1,8 @@
 #ifndef AMREX_PARTICLEIO_H
 #define AMREX_PARTICLEIO_H
 
+#include <AMReX_WriteBinaryParticleData.H>
+
 template <int NStructReal, int NStructInt, int NArrayReal, int NArrayInt>
 void
 ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
@@ -51,7 +53,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
             tmp_real_comp_names.push_back(real_comp_names[i]);
         }
     }
-    
+
     Vector<int> write_int_comp;
     Vector<std::string> tmp_int_comp_names;
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
@@ -91,7 +93,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         ss << "real_comp" << i;
         real_comp_names.push_back(ss.str());
     }
-    
+
     Vector<int> write_int_comp;
     Vector<std::string> int_comp_names;
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
@@ -101,7 +103,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         ss << "int_comp" << i;
         int_comp_names.push_back(ss.str());
     }
-    
+
     WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp,
                             real_comp_names, int_comp_names,
                             [=] AMREX_GPU_HOST_DEVICE (const SuperParticleType& p) -> int
@@ -124,7 +126,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         ss << "real_comp" << i;
         real_comp_names.push_back(ss.str());
     }
-    
+
     Vector<int> write_int_comp;
     Vector<std::string> int_comp_names;
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
@@ -134,7 +136,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         ss << "int_comp" << i;
         int_comp_names.push_back(ss.str());
     }
-    
+
     WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp,
                             real_comp_names, int_comp_names,
                             [=] AMREX_GPU_HOST_DEVICE (const SuperParticleType& p)
@@ -149,7 +151,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 ::WritePlotFile (const std::string& dir, const std::string& name,
                  const Vector<std::string>& real_comp_names,
                  const Vector<std::string>& int_comp_names) const
-{    
+{
     AMREX_ASSERT(real_comp_names.size() == NStructReal + NumRealComps());
     AMREX_ASSERT( int_comp_names.size() == NStructInt  + NumIntComps() );
 
@@ -173,7 +175,7 @@ void
 ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 ::WritePlotFile (const std::string& dir, const std::string& name,
                  const Vector<std::string>& real_comp_names) const
-{    
+{
     AMREX_ASSERT(real_comp_names.size() == NStructReal + NumRealComps());
 
     Vector<int> write_real_comp;
@@ -189,7 +191,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         ss << "int_comp" << i;
         int_comp_names.push_back(ss.str());
     }
-        
+
     WriteBinaryParticleData(dir, name,
                             write_real_comp, write_int_comp,
                             real_comp_names, int_comp_names,
@@ -206,10 +208,10 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
                  const std::string& name,
                  const Vector<int>& write_real_comp,
                  const Vector<int>& write_int_comp) const
-{    
+{
     AMREX_ASSERT(write_real_comp.size() == NStructReal + NumRealComps());
     AMREX_ASSERT(write_int_comp.size()  == NStructInt  + NArrayInt );
-    
+
     Vector<std::string> real_comp_names;
     for (int i = 0; i < NStructReal + NumRealComps(); ++i )
     {
@@ -239,12 +241,12 @@ void
 ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>::
 WritePlotFile (const std::string& dir, const std::string& name,
                const Vector<int>& write_real_comp,
-               const Vector<int>& write_int_comp,    
+               const Vector<int>& write_int_comp,
                const Vector<std::string>& real_comp_names,
                const Vector<std::string>&  int_comp_names) const
 {
     BL_PROFILE("ParticleContainer::WritePlotFile()");
-    
+
     WriteBinaryParticleData(dir, name,
                             write_real_comp, write_int_comp,
                             real_comp_names, int_comp_names,
@@ -269,7 +271,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         ss << "real_comp" << i;
         real_comp_names.push_back(ss.str());
     }
-    
+
     Vector<int> write_int_comp;
     Vector<std::string> int_comp_names;
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
@@ -279,7 +281,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         ss << "int_comp" << i;
         int_comp_names.push_back(ss.str());
     }
-    
+
     WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp,
                             real_comp_names, int_comp_names,
                             std::forward<F>(f));
@@ -292,7 +294,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 ::WritePlotFile (const std::string& dir, const std::string& name,
                  const Vector<std::string>& real_comp_names,
                  const Vector<std::string>& int_comp_names, F&& f) const
-{    
+{
     AMREX_ASSERT(real_comp_names.size() == NStructReal + NumRealComps());
     AMREX_ASSERT( int_comp_names.size() == NStructInt  + NArrayInt );
 
@@ -314,7 +316,7 @@ void
 ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 ::WritePlotFile (const std::string& dir, const std::string& name,
                  const Vector<std::string>& real_comp_names, F&& f) const
-{    
+{
     AMREX_ASSERT(real_comp_names.size() == NStructReal + NumRealComps());
 
     Vector<int> write_real_comp;
@@ -330,7 +332,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         ss << "int_comp" << i;
         int_comp_names.push_back(ss.str());
     }
-        
+
     WriteBinaryParticleData(dir, name,
                             write_real_comp, write_int_comp,
                             real_comp_names, int_comp_names,
@@ -345,10 +347,10 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
                  const std::string& name,
                  const Vector<int>& write_real_comp,
                  const Vector<int>& write_int_comp, F&& f) const
-{    
+{
     AMREX_ASSERT(write_real_comp.size() == NStructReal + NumRealComps());
     AMREX_ASSERT(write_int_comp.size()  == NStructInt  + NumIntComps() );
-    
+
     Vector<std::string> real_comp_names;
     for (int i = 0; i < NStructReal + NumRealComps(); ++i )
     {
@@ -376,13 +378,13 @@ void
 ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>::
 WritePlotFile (const std::string& dir, const std::string& name,
                const Vector<int>& write_real_comp,
-               const Vector<int>& write_int_comp,    
+               const Vector<int>& write_int_comp,
                const Vector<std::string>& real_comp_names,
                const Vector<std::string>&  int_comp_names,
                F&& f) const
 {
     BL_PROFILE("ParticleContainer::WritePlotFile()");
-    
+
     WriteBinaryParticleData(dir, name,
                             write_real_comp, write_int_comp,
                             real_comp_names, int_comp_names,
@@ -400,302 +402,19 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
                            const Vector<std::string>& int_comp_names,
                            F&& f) const
 {
-    BL_PROFILE("ParticleContainer::WriteBinaryParticleData()");
-    AMREX_ASSERT(OK());
-    
-    AMREX_ASSERT(sizeof(typename ParticleType::RealType) == 4 ||
-                 sizeof(typename ParticleType::RealType) == 8);
-    
-    const int NProcs = ParallelDescriptor::NProcs();
-    const int IOProcNumber = ParallelDescriptor::IOProcessorNumber();
-    const Real strttime = amrex::second();
-    
-    AMREX_ALWAYS_ASSERT(real_comp_names.size() == NumRealComps() + NStructReal);
-    AMREX_ALWAYS_ASSERT( int_comp_names.size() == NumIntComps() + NStructInt);
-
-    std::string pdir = dir;
-    if ( not pdir.empty() and pdir[pdir.size()-1] != '/') pdir += '/';
-    pdir += name;
-    
-    if ( ! levelDirectoriesCreated)
-    {
-        if (ParallelDescriptor::IOProcessor()) 
-            if ( ! amrex::UtilCreateDirectory(pdir, 0755)) 
-                amrex::CreateDirectoryFailed(pdir);
-        ParallelDescriptor::Barrier();
-    }
-    
-    std::ofstream HdrFile;
-    
-    Long nparticles = 0;
-    int maxnextid;
-
-    // evaluate f for every particle to determine which ones to output
-    Vector<std::map<std::pair<int, int>, Gpu::DeviceVector<int> > > particle_io_flags(m_particles.size());		
-    for (int lev = 0; lev < m_particles.size();  lev++)
-    {
-        const auto& pmap = m_particles[lev];
-        for (const auto& kv : pmap)
-        {
-            const auto ptd = kv.second.getConstParticleTileData();
-            const auto np = kv.second.numParticles();
-            particle_io_flags[lev][kv.first].resize(np, 0);
-            auto pflags = particle_io_flags[lev][kv.first].data();
-            AMREX_HOST_DEVICE_FOR_1D( np, k, 
-            {
-                const auto p = ptd.getSuperParticle(k);
-                pflags[k] = f(p);
-            });
-        }
-    }
-
-    Gpu::Device::synchronize();
-    
-    if(usePrePost)
-    {
-        nparticles = nparticlesPrePost;
-        maxnextid  = maxnextidPrePost;
-    }
-    else
-    {
-        nparticles = 0;
-        maxnextid  = ParticleType::NextID();
-
-        for (int lev = 0; lev < m_particles.size();  lev++)
-        {
-            const auto& pmap = m_particles[lev];
-            for (const auto& kv : pmap)
-            {
-                const auto& pflags = particle_io_flags[lev][kv.first];
-                for (int k = 0; k < kv.second.numParticles(); ++k) 
-                {
-                    if (pflags[k]) nparticles++;
-                }
-            }
-        }
-        
-        ParallelDescriptor::ReduceLongSum(nparticles, IOProcNumber);
-        ParticleType::NextID(maxnextid);
-        ParallelDescriptor::ReduceIntMax(maxnextid, IOProcNumber);
-    }
-
-    if (ParallelDescriptor::IOProcessor())
-    {
-        std::string HdrFileName = pdir;
-	
-        if ( ! HdrFileName.empty() && HdrFileName[HdrFileName.size()-1] != '/')
-            HdrFileName += '/';
-        
-        HdrFileName += "Header";
-        HdrFileNamePrePost = HdrFileName;
-	
-        HdrFile.open(HdrFileName.c_str(), std::ios::out|std::ios::trunc);
-	
-        if ( ! HdrFile.good()) amrex::FileOpenFailed(HdrFileName);
-
-        //
-        // First thing written is our Checkpoint/Restart version string.
-        // We append "_single" or "_double" to the version string indicating
-        // whether we're using "float" or "double" floating point data in the
-        // particles so that we can Restart from the checkpoint files.
-        //
-        if (sizeof(typename ParticleType::RealType) == 4)
-        {
-            HdrFile << ParticleType::Version() << "_single" << '\n';
-        }
-        else
-        {
-            HdrFile << ParticleType::Version() << "_double" << '\n';
-        }
-
-        int num_output_real = 0;
-        for (int i = 0; i < NumRealComps() + NStructReal; ++i)
-            if (write_real_comp[i]) ++num_output_real;
-        
-        int num_output_int = 0;
-        for (int i = 0; i < NumIntComps() + NStructInt; ++i)
-            if (write_int_comp[i]) ++num_output_int;
-        
-        // AMREX_SPACEDIM and N for sanity checking.
-        HdrFile << AMREX_SPACEDIM << '\n';
-	
-        // The number of extra real parameters
-        HdrFile << num_output_real << '\n';
-        
-        // Real component names
-        for (int i = 0; i < NStructReal + NumRealComps(); ++i )
-            if (write_real_comp[i]) HdrFile << real_comp_names[i] << '\n';
-        
-        // The number of extra int parameters
-        HdrFile << num_output_int << '\n';
-        
-        // int component names
-        for (int i = 0; i < NStructInt + NumIntComps(); ++i )
-            if (write_int_comp[i]) HdrFile << int_comp_names[i] << '\n';
-
-        bool is_checkpoint = true; // legacy
-        HdrFile << is_checkpoint << '\n';
-
-        // The total number of particles.
-        HdrFile << nparticles << '\n';
-
-        // The value of nextid that we need to restore on restart.
-        HdrFile << maxnextid << '\n';
-
-        // Then the finest level of the AMR hierarchy.
-        HdrFile << finestLevel() << '\n';
-
-        // Then the number of grids at each level.
-        for (int lev = 0; lev <= finestLevel(); lev++)
-            HdrFile << ParticleBoxArray(lev).size() << '\n';
-    }
-
-    // We want to write the data out in parallel.
-    // We'll allow up to nOutFiles active writers at a time.
-    int nOutFiles(256);
-
-    ParmParse pp("particles");
-    pp.query("particles_nfiles",nOutFiles);
-    if(nOutFiles == -1) nOutFiles = NProcs;
-    nOutFiles = std::max(1, std::min(nOutFiles,NProcs));
-    nOutFilesPrePost = nOutFiles;
-
-    for (int lev = 0; lev <= finestLevel(); lev++)
-    {
-        bool gotsome;
-        if(usePrePost)
-        {
-            gotsome = (nParticlesAtLevelPrePost[lev] > 0);
-        }
-        else
-        {
-            gotsome = (NumberOfParticlesAtLevel(lev) > 0);
-        }
-
-        // We store the particles at each level in their own subdirectory.
-        std::string LevelDir = pdir;
-        
-        if (gotsome)
-        {
-            if ( ! LevelDir.empty() && LevelDir[LevelDir.size()-1] != '/') LevelDir += '/';
-	    
-            LevelDir = amrex::Concatenate(LevelDir + "Level_", lev, 1);
-            
-            if ( ! levelDirectoriesCreated) {
-                if (ParallelDescriptor::IOProcessor()) 
-                    if ( ! amrex::UtilCreateDirectory(LevelDir, 0755)) 
-                        amrex::CreateDirectoryFailed(LevelDir);
-                //
-                // Force other processors to wait until directory is built.
-                //
-                ParallelDescriptor::Barrier();
-            }
-        }
-	
-        // Write out the header for each particle
-        if (gotsome and ParallelDescriptor::IOProcessor()) {
-            std::string HeaderFileName = LevelDir;
-            HeaderFileName += "/Particle_H";
-            std::ofstream ParticleHeader(HeaderFileName);
-            
-            ParticleBoxArray(lev).writeOn(ParticleHeader);
-            ParticleHeader << '\n';
-            
-            ParticleHeader.flush();
-            ParticleHeader.close();
-        }
-        
-        MFInfo info;
-        info.SetAlloc(false);
-        MultiFab state(ParticleBoxArray(lev),
-                       ParticleDistributionMap(lev),
-                       1,0,info);
-        
-        // We eventually want to write out the file name and the offset
-        // into that file into which each grid of particles is written.
-        Vector<int>  which(state.size(),0);
-        Vector<int > count(state.size(),0);
-        Vector<Long> where(state.size(),0);
-	
-        std::string filePrefix(LevelDir);
-        filePrefix += '/';
-        filePrefix += ParticleType::DataPrefix();
-        if(usePrePost) {
-            filePrefixPrePost[lev] = filePrefix;
-        }
-        bool groupSets(false), setBuf(true);
-        
-        if (gotsome)
-        {
-            for(NFilesIter nfi(nOutFiles, filePrefix, groupSets, setBuf); nfi.ReadyToWrite(); ++nfi)
-            {
-                std::ofstream& myStream = (std::ofstream&) nfi.Stream();
-                WriteParticles(lev, myStream, nfi.FileNumber(), which, count, where,
-                               write_real_comp, write_int_comp, particle_io_flags);
-            }
-            
-            if(usePrePost) {
-                whichPrePost[lev] = which;
-                countPrePost[lev] = count;
-                wherePrePost[lev] = where;
-            } else {
-                ParallelDescriptor::ReduceIntSum (which.dataPtr(), which.size(), IOProcNumber);
-                ParallelDescriptor::ReduceIntSum (count.dataPtr(), count.size(), IOProcNumber);
-                ParallelDescriptor::ReduceLongSum(where.dataPtr(), where.size(), IOProcNumber);
-            }
-        }
-	
-        if (ParallelDescriptor::IOProcessor())
-        {
-            if(usePrePost) {
-                // ---- write to the header and unlink in CheckpointPost
-            } else {
-                for (int j = 0; j < state.size(); j++)
-                {
-                    HdrFile << which[j] << ' ' << count[j] << ' ' << where[j] << '\n';
-                }
-				
-                if (gotsome && doUnlink)
-                {
-                    // Unlink any zero-length data files.
-                    Vector<Long> cnt(nOutFiles,0);
-					
-                    for (int i = 0, N=count.size(); i < N; i++) {
-                        cnt[which[i]] += count[i];
-                    }
-                    
-                    for (int i = 0, N=cnt.size(); i < N; i++)
-                    {
-                        if (cnt[i] == 0)
-                        {
-                            std::string FullFileName = NFilesIter::FileName(i, filePrefix);
-                            FileSystem::Remove(FullFileName);
-                        }
-                    }
-                }                
-            }            
-        }
-    }
-    
-    if (ParallelDescriptor::IOProcessor())
+    if (AsyncOut::UseAsyncOut()) {
+        WriteBinaryParticleDataAsync(*this, dir, name,
+                                     write_real_comp, write_int_comp,
+                                     real_comp_names, int_comp_names);
+    } else
     {
-        HdrFile.flush();
-        HdrFile.close();
-        if ( ! HdrFile.good())
-        {
-            amrex::Abort("ParticleContainer::Checkpoint(): problem writing HdrFile");
-        }
-    }
-    
-    if (m_verbose > 1)
-    {
-        Real stoptime = amrex::second() - strttime;        
-        ParallelDescriptor::ReduceRealMax(stoptime, IOProcNumber);        
-        amrex::Print() << "ParticleContainer::Checkpoint() time: " << stoptime << '\n';
+        WriteBinaryParticleDataSync(*this, dir, name,
+                                    write_real_comp, write_int_comp,
+                                    real_comp_names, int_comp_names,
+                                    std::forward<F>(f));
     }
 }
 
-
 template <int NStructReal, int NStructInt, int NArrayReal, int NArrayInt>
 void
 ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
@@ -704,13 +423,13 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     if( ! usePrePost) {
         return;
     }
-    
+
     BL_PROFILE("ParticleContainer::CheckpointPre()");
-    
+
     const int IOProcNumber = ParallelDescriptor::IOProcessorNumber();
     Long nparticles = 0;
     int  maxnextid  = ParticleType::NextID();
-    
+
     for (int lev = 0; lev < m_particles.size();  lev++) {
         const auto& pmap = m_particles[lev];
         for (const auto& kv : pmap) {
@@ -730,23 +449,23 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 
     ParticleType::NextID(maxnextid);
     ParallelDescriptor::ReduceIntMax(maxnextid, IOProcNumber);
-    
+
     nparticlesPrePost = nparticles;
     maxnextidPrePost  = maxnextid;
-    
+
     nParticlesAtLevelPrePost.clear();
     nParticlesAtLevelPrePost.resize(finestLevel() + 1, 0);
     for(int lev(0); lev <= finestLevel(); ++lev) {
         nParticlesAtLevelPrePost[lev] = NumberOfParticlesAtLevel(lev);
     }
-    
+
     whichPrePost.clear();
     whichPrePost.resize(finestLevel() + 1);
     countPrePost.clear();
     countPrePost.resize(finestLevel() + 1);
     wherePrePost.clear();
     wherePrePost.resize(finestLevel() + 1);
-    
+
     filePrefixPrePost.clear();
     filePrefixPrePost.resize(finestLevel() + 1);
 }
@@ -760,34 +479,34 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     if( ! usePrePost) {
         return;
     }
-    
+
     BL_PROFILE("ParticleContainer::CheckpointPost()");
-    
+
     const int IOProcNumber = ParallelDescriptor::IOProcessorNumber();
     std::ofstream HdrFile;
     HdrFile.open(HdrFileNamePrePost.c_str(), std::ios::out | std::ios::app);
-    
+
     for(int lev(0); lev <= finestLevel(); ++lev) {
         ParallelDescriptor::ReduceIntSum (whichPrePost[lev].dataPtr(), whichPrePost[lev].size(), IOProcNumber);
         ParallelDescriptor::ReduceIntSum (countPrePost[lev].dataPtr(), countPrePost[lev].size(), IOProcNumber);
         ParallelDescriptor::ReduceLongSum(wherePrePost[lev].dataPtr(), wherePrePost[lev].size(), IOProcNumber);
-        
-        
+
+
         if(ParallelDescriptor::IOProcessor()) {
             for(int j(0); j < whichPrePost[lev].size(); ++j) {
                 HdrFile << whichPrePost[lev][j] << ' ' << countPrePost[lev][j] << ' ' << wherePrePost[lev][j] << '\n';
             }
-            
+
             const bool gotsome = (nParticlesAtLevelPrePost[lev] > 0);
             if(gotsome && doUnlink) {
 //            BL_PROFILE_VAR("PC<NNNN>::Checkpoint:unlink", unlink_post);
                 // Unlink any zero-length data files.
                 Vector<Long> cnt(nOutFilesPrePost,0);
-                
+
                 for(int i(0), N = countPrePost[lev].size(); i < N; ++i) {
                     cnt[whichPrePost[lev][i]] += countPrePost[lev][i];
                 }
-                
+
                 for(int i(0), N = cnt.size(); i < N; ++i) {
                     if(cnt[i] == 0) {
                         std::string FullFileName = NFilesIter::FileName(i, filePrefixPrePost[lev]);
@@ -797,7 +516,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
             }
         }
     }
-    
+
     if(ParallelDescriptor::IOProcessor()) {
         HdrFile.flush();
         HdrFile.close();
@@ -845,39 +564,39 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         const int tile = kv.first.second;
         tile_map[grid].push_back(tile);
         const auto& pflags = particle_io_flags[lev].at(kv.first);
-		
+
         // Only write out valid particles.
-        int cnt = 0;	
+        int cnt = 0;
         for (int k = 0; k < kv.second.GetArrayOfStructs().numParticles(); ++k)
         {
             if (pflags[k]) cnt++;
         }
-		
+
         count[grid] += cnt;
     }
-    
+
     MFInfo info;
     info.SetAlloc(false);
     MultiFab state(ParticleBoxArray(lev), ParticleDistributionMap(lev), 1,0,info);
-    
+
     for (MFIter mfi(state); mfi.isValid(); ++mfi)
     {
         const int grid = mfi.index();
-        
+
         which[grid] = fnum;
         where[grid] = VisMF::FileOffset(ofs);
-        
+
         if (count[grid] == 0) continue;
-      
+
         // First write out the integer data in binary.
         int num_output_int = 0;
         for (int i = 0; i < NumIntComps() + NStructInt; ++i)
             if (write_int_comp[i]) ++num_output_int;
-        
+
         const int iChunkSize = 2 + num_output_int;
         Vector<int> istuff(count[grid]*iChunkSize);
         int* iptr = istuff.dataPtr();
-        
+
         for (unsigned i = 0; i < tile_map[grid].size(); i++) {
             auto ptile_index = std::make_pair(grid, tile_map[grid][i]);
             const auto& pbox = m_particles[lev].at(ptile_index);
@@ -913,19 +632,19 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
                 }
             }
         }
-                
+
         writeIntData(istuff.dataPtr(), istuff.size(), ofs);
         ofs.flush();  // Some systems require this flush() (probably due to a bug)
-        
+
         // Write the Real data in binary.
         int num_output_real = 0;
         for (int i = 0; i < NumRealComps() + NStructReal; ++i)
             if (write_real_comp[i]) ++num_output_real;
-        
+
         const int rChunkSize = AMREX_SPACEDIM + num_output_real;
         Vector<typename ParticleType::RealType> rstuff(count[grid]*rChunkSize);
         typename ParticleType::RealType* rptr = rstuff.dataPtr();
-        
+
         for (unsigned i = 0; i < tile_map[grid].size(); i++) {
 			auto ptile_index = std::make_pair(grid, tile_map[grid][i]);
             const auto& pbox = m_particles[lev].at(ptile_index);
@@ -984,9 +703,9 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     BL_PROFILE("ParticleContainer::Restart()");
     AMREX_ASSERT(!dir.empty());
     AMREX_ASSERT(!file.empty());
-    
+
     const Real strttime = amrex::second();
-    
+
     int DATA_Digits_Read(5);
     ParmParse pp("particles");
     pp.query("datadigits_read",DATA_Digits_Read);
@@ -999,16 +718,16 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     if (!HdrFileName.empty() && HdrFileName[HdrFileName.size()-1] != '/')
         HdrFileName += '/';
     HdrFileName += "Header";
-    
+
     Vector<char> fileCharPtr;
     ParallelDescriptor::ReadAndBcastFile(HdrFileName, fileCharPtr);
     std::string fileCharPtrString(fileCharPtr.dataPtr());
     std::istringstream HdrFile(fileCharPtrString, std::istringstream::in);
-  
+
     std::string version;
     HdrFile >> version;
     AMREX_ASSERT(!version.empty());
-    
+
     // What do our version strings mean?
     // "Version_One_Dot_Zero" -- hard-wired to write out in double precision.
     // "Version_One_Dot_One" -- can write out either as either single or double precision.
@@ -1038,45 +757,45 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         msg += version;
         amrex::Abort(msg.c_str());
     }
-    
+
     int dm;
     HdrFile >> dm;
     if (dm != AMREX_SPACEDIM)
         amrex::Abort("ParticleContainer::Restart(): dm != AMREX_SPACEDIM");
-    
+
     int nr;
     HdrFile >> nr;
     if (nr != NStructReal + NumRealComps())
         amrex::Abort("ParticleContainer::Restart(): nr != NStructReal + NumRealComps()");
-    
+
     std::string comp_name;
     for (int i = 0; i < nr; ++i)
         HdrFile >> comp_name;
-    
+
     int ni;
     HdrFile >> ni;
     if (ni != NStructInt + NumIntComps())
         amrex::Abort("ParticleContainer::Restart(): ni != NStructInt");
-    
+
     for (int i = 0; i < ni; ++i)
         HdrFile >> comp_name;
-    
+
     bool checkpoint;
     HdrFile >> checkpoint;
-    
+
     Long nparticles;
     HdrFile >> nparticles;
     AMREX_ASSERT(nparticles >= 0);
-    
+
     int maxnextid;
     HdrFile >> maxnextid;
     AMREX_ASSERT(maxnextid > 0);
     ParticleType::NextID(maxnextid);
-    
+
     int finest_level_in_file;
     HdrFile >> finest_level_in_file;
     AMREX_ASSERT(finest_level_in_file >= 0);
-    
+
     // Determine whether this is a dual-grid restart or not.
     Vector<BoxArray> particle_box_arrays(finest_level_in_file + 1);
     bool dual_grid = false;
@@ -1091,7 +810,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         if (amrex::FileExists(phdr_name)) {
             have_pheaders = true;
             break;
-        }        
+        }
     }
 
     if (have_pheaders)
@@ -1103,18 +822,18 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
             phdr_name += "/Particle_H";
 
             if (not amrex::FileExists(phdr_name)) continue;
-            
+
             Vector<char> phdr_chars;
             ParallelDescriptor::ReadAndBcastFile(phdr_name, phdr_chars);
             std::string phdr_string(phdr_chars.dataPtr());
             std::istringstream phdr_file(phdr_string, std::istringstream::in);
-            
+
             if (lev > finestLevel())
             {
                 dual_grid = true;
                 break;
             }
-        
+
             particle_box_arrays[lev].readFrom(phdr_file);
             if (not particle_box_arrays[lev].CellEqual(ParticleBoxArray(lev))) dual_grid = true;
         }
@@ -1130,7 +849,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
             SetParticleDistributionMap(lev, pdm);
         }
     }
-    
+
     Vector<int> ngrids(finest_level_in_file+1);
     for (int lev = 0; lev <= finest_level_in_file; lev++) {
         HdrFile >> ngrids[lev];
@@ -1139,13 +858,13 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
             AMREX_ASSERT(ngrids[lev] == int(ParticleBoxArray(lev).size()));
         }
     }
-    
+
     resizeData();
-    
+
     if (finest_level_in_file > finestLevel()) {
         m_particles.resize(finest_level_in_file+1);
     }
-    
+
     for (int lev = 0; lev <= finest_level_in_file; lev++) {
         Vector<int>  which(ngrids[lev]);
         Vector<int>  count(ngrids[lev]);
@@ -1153,65 +872,65 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         for (int i = 0; i < ngrids[lev]; i++) {
             HdrFile >> which[i] >> count[i] >> where[i];
         }
-        
+
         Vector<int> grids_to_read;
         if (lev <= finestLevel()) {
             for (MFIter mfi(*m_dummy_mf[lev]); mfi.isValid(); ++mfi) {
                 grids_to_read.push_back(mfi.index());
             }
         } else {
-            
+
             // we lost a level on restart. we still need to read in particles
             // on finer levels, and put them in the right place via Redistribute()
-            
+
             const int rank = ParallelDescriptor::MyProc();
             const int NReaders = ParticleType::MaxReaders();
             if (rank >= NReaders) return;
-            
+
             const int Navg = ngrids[lev] / NReaders;
             const int Nleft = ngrids[lev] - Navg * NReaders;
-            
+
             int lo, hi;
             if (rank < Nleft) {
                 lo = rank*(Navg + 1);
                 hi = lo + Navg + 1;
-            } 
+            }
             else {
                 lo = rank * Navg + Nleft;
                 hi = lo + Navg;
             }
-            
+
             for (int i = lo; i < hi; ++i) {
                 grids_to_read.push_back(i);
             }
         }
-        
+
         for(int igrid = 0; igrid < static_cast<int>(grids_to_read.size()); ++igrid) {
             const int grid = grids_to_read[igrid];
-            
+
             if (count[grid] <= 0) continue;
-            
+
             // The file names in the header file are relative.
             std::string name = fullname;
-            
+
             if (!name.empty() && name[name.size()-1] != '/')
                 name += '/';
-            
+
             name += "Level_";
             name += amrex::Concatenate("", lev, 1);
             name += '/';
             name += ParticleType::DataPrefix();
             name += amrex::Concatenate("", which[grid], DATA_Digits_Read);
-            
+
             std::ifstream ParticleFile;
-            
+
             ParticleFile.open(name.c_str(), std::ios::in | std::ios::binary);
-            
+
             if (!ParticleFile.good())
                 amrex::FileOpenFailed(name);
-            
+
             ParticleFile.seekg(where[grid], std::ios::beg);
-            
+
             if (how == "single") {
                 ReadParticles<float>(count[grid], grid, lev, ParticleFile, finest_level_in_file);
             }
@@ -1223,20 +942,20 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
                 msg += how;
                 amrex::Error(msg.c_str());
             }
-            
+
             ParticleFile.close();
-            
+
             if (!ParticleFile.good())
                 amrex::Abort("ParticleContainer::Restart(): problem reading particles");
         }
     }
-    
+
     Redistribute();
-    
+
     AMREX_ASSERT(OK());
-    
+
     if (m_verbose > 1) {
-        Real stoptime = amrex::second() - strttime;	
+        Real stoptime = amrex::second() - strttime;
         ParallelDescriptor::ReduceRealMax(stoptime, ParallelDescriptor::IOProcessorNumber());
         amrex::Print() << "ParticleContainer::Restart() time: " << stoptime << '\n';
     }
@@ -1259,16 +978,16 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     const int iChunkSize = 2 + NStructInt + NumIntComps();
     Vector<int> istuff(cnt*iChunkSize);
     readIntData(istuff.dataPtr(), istuff.size(), ifs, FPC::NativeIntDescriptor());
-    
+
     // Then the real data in binary.
     const int rChunkSize = AMREX_SPACEDIM + NStructReal + NumRealComps();
     Vector<RTYPE> rstuff(cnt*rChunkSize);
     ReadParticleRealData(rstuff.dataPtr(), rstuff.size(), ifs);
-    
+
     // Now reassemble the particles.
     int*   iptr = istuff.dataPtr();
     RTYPE* rptr = rstuff.dataPtr();
-    
+
     ParticleType p;
     ParticleLocData pld;
 
@@ -1318,7 +1037,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 
         host_real_attribs[lev][ind].resize(NumRealComps());
         host_int_attribs[lev][ind].resize(NumIntComps());
-        
+
 	// add the struct
 	host_particles[lev][ind].push_back(p);
 
@@ -1327,12 +1046,12 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
             host_real_attribs[lev][ind][icomp].push_back(*rptr);
             ++rptr;
 	}
-        
+
 	// ... and int array data
 	for (int icomp = 0; icomp < NumIntComps(); icomp++) {
             host_int_attribs[lev][ind][icomp].push_back(*iptr);
             ++iptr;
-	}        
+	}
     }
 
     for (int host_lev = 0; host_lev < static_cast<int>(host_particles.size()); ++host_lev)
@@ -1341,22 +1060,22 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 	  auto grid = kv.first.first;
 	  auto tile = kv.first.second;
 	  const auto& src_tile = kv.second;
-          
+
 	  auto& dst_tile = DefineAndReturnParticleTile(host_lev, grid, tile);
 	  auto old_size = dst_tile.GetArrayOfStructs().size();
 	  auto new_size = old_size + src_tile.size();
 	  dst_tile.resize(new_size);
-                
+
 	  Gpu::copy(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
                     dst_tile.GetArrayOfStructs().begin() + old_size);
-	  
+
 	  for (int i = 0; i < NumRealComps(); ++i) {
               Gpu::copy(Gpu::hostToDevice,
                         host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(),
                         host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(),
                         dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size);
 	  }
-	  
+
 	  for (int i = 0; i < NumIntComps(); ++i) {
               Gpu::copy(Gpu::hostToDevice,
                         host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(),
@@ -1365,7 +1084,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 	  }
 	}
       }
-    
+
     Gpu::streamSynchronize();
 }
 
@@ -1399,7 +1118,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>::WriteAsciiFil
             }
         }
     }
-    
+
     //
     // And send count to I/O processor.
     //
@@ -1422,7 +1141,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>::WriteAsciiFil
         File << NStructInt  << '\n';
         File << NumRealComps()  << '\n';
         File << NumIntComps()   << '\n';
-            
+
         File.flush();
 
         File.close();
diff --git a/Src/Particle/AMReX_ParticleReduce.H b/Src/Particle/AMReX_ParticleReduce.H
index 0962746755a..f7b26e04b58 100644
--- a/Src/Particle/AMReX_ParticleReduce.H
+++ b/Src/Particle/AMReX_ParticleReduce.H
@@ -16,7 +16,7 @@ namespace amrex
 /**
  * \brief A general reduction method for the particles in a ParticleContainer that can run on either CPUs or GPUs.
  * This version operates over all particles on all levels.
- * 
+ *
  * This version uses "Sum" as the reduction operation. The quantity reduced over is an arbitrary function
  * of a "superparticle", which contains all the data in the particle type, whether it is stored in AoS or
  * SoA form.
@@ -25,12 +25,12 @@ namespace amrex
  * call the MPI reduction operations described in ParallelDescriptor if they want that behavior.
  *
  * \tparam PC the ParticleContainer type
- * \tparam F a function object 
+ * \tparam F a function object
  *
- * \param pc the ParticleContainer to operate on 
+ * \param pc the ParticleContainer to operate on
  * \param f a function that takes a "superparticle" and returns the value to be reduced over all particles.
  *
- */    
+ */
 template <class PC, class F, EnableIf_t<IsParticleContainer<PC>::value, int> foo = 0>
 auto
 ReduceSum (PC const& pc, F&& f) -> decltype(f(typename PC::SuperParticleType()))
diff --git a/Src/Particle/AMReX_ParticleTransformation.H b/Src/Particle/AMReX_ParticleTransformation.H
index 7f44453c9b0..efeb36da842 100644
--- a/Src/Particle/AMReX_ParticleTransformation.H
+++ b/Src/Particle/AMReX_ParticleTransformation.H
@@ -11,7 +11,7 @@ namespace amrex
 {
 
 /**
- * \brief A general single particle copying routine that can run on the GPU. 
+ * \brief A general single particle copying routine that can run on the GPU.
  *
  * \tparam NSR number of extra reals in the particle struct
  * \tparam NSI number of extra ints in the particle struct
@@ -26,8 +26,8 @@ namespace amrex
  */
 template <int NSR, int NSI, int NAR, int NAI>
 AMREX_GPU_HOST_DEVICE AMREX_INLINE
-void copyParticle (const      ParticleTileData<NSR, NSI, NAR, NAI>& dst, 
-                   const ConstParticleTileData<NSR, NSI, NAR, NAI>& src, 
+void copyParticle (const      ParticleTileData<NSR, NSI, NAR, NAI>& dst,
+                   const ConstParticleTileData<NSR, NSI, NAR, NAI>& src,
                    int src_i, int dst_i) noexcept
 {
     AMREX_ASSERT(dst.m_num_runtime_real == src.m_num_runtime_real);
@@ -45,7 +45,7 @@ void copyParticle (const      ParticleTileData<NSR, NSI, NAR, NAI>& dst,
 }
 
 /**
- * \brief A general single particle copying routine that can run on the GPU. 
+ * \brief A general single particle copying routine that can run on the GPU.
  *
  * \tparam NSR number of extra reals in the particle struct
  * \tparam NSI number of extra ints in the particle struct
@@ -60,8 +60,8 @@ void copyParticle (const      ParticleTileData<NSR, NSI, NAR, NAI>& dst,
  */
 template <int NSR, int NSI, int NAR, int NAI>
 AMREX_GPU_HOST_DEVICE AMREX_INLINE
-void copyParticle (const ParticleTileData<NSR, NSI, NAR, NAI>& dst, 
-                   const ParticleTileData<NSR, NSI, NAR, NAI>& src, 
+void copyParticle (const ParticleTileData<NSR, NSI, NAR, NAI>& dst,
+                   const ParticleTileData<NSR, NSI, NAR, NAI>& src,
                    int src_i, int dst_i) noexcept
 {
     AMREX_ASSERT(dst.m_num_runtime_real == src.m_num_runtime_real);
@@ -77,9 +77,9 @@ void copyParticle (const ParticleTileData<NSR, NSI, NAR, NAI>& dst,
     for (int j = 0; j < dst.m_num_runtime_int; ++j)
         dst.m_runtime_idata[j][dst_i] = src.m_runtime_idata[j][src_i];
 }
-    
+
 /**
- * \brief A general single particle swapping routine that can run on the GPU. 
+ * \brief A general single particle swapping routine that can run on the GPU.
  *
  * \tparam NSR number of extra reals in the particle struct
  * \tparam NSI number of extra ints in the particle struct
@@ -111,20 +111,21 @@ void swapParticle (const ParticleTileData<NSR, NSI, NAR, NAI>& dst,
     for (int j = 0; j < dst.m_num_runtime_int; ++j)
         amrex::Swap(dst.m_runtime_idata[j][dst_i], src.m_runtime_idata[j][src_i]);
 }
-    
+
 /**
- * \brief Copy particles from src to dst. This version copies all the 
+ * \brief Copy particles from src to dst. This version copies all the
  * particles, writing them to the beginning of dst.
- * 
- * \tparam PTile the particle tile type
+ *
+ * \tparam DstTile the dst particle tile type
+ * \tparam SrcTile the src particle tile type
  *
  * \param dst the destination tile
  * \param src the source tile
  * \param f the function that will be applied to each particle
  *
- */    
-template <typename PTile>
-void copyParticles (PTile& dst, const PTile& src) noexcept
+ */
+template <typename DstTile, typename SrcTile>
+void copyParticles (DstTile& dst, const SrcTile& src) noexcept
 {
     auto np = src.numParticles();
     copyParticles(dst, src, 0, 0, np);
@@ -133,8 +134,9 @@ void copyParticles (PTile& dst, const PTile& src) noexcept
 /**
  * \brief Copy particles from src to dst. This version copies n particles
  * starting at index src_start, writing the result starting at dst_start.
- * 
- * \tparam PTile the particle tile type
+ *
+ * \tparam DstTile the dst particle tile type
+ * \tparam SrcTile the src particle tile type
  * \tparam Index the index type, e.g. unsigned int
  * \tparam N the size type, e.g. Long
  *
@@ -144,9 +146,9 @@ void copyParticles (PTile& dst, const PTile& src) noexcept
  * \param dst_start the offset at which to start writing particles to dst
  *
  */
-template <typename PTile, typename Index, typename N,
+template <typename DstTile, typename SrcTile, typename Index, typename N,
           amrex::EnableIf_t<std::is_integral<Index>::value, int> foo = 0>
-void copyParticles (PTile& dst, const PTile& src,
+void copyParticles (DstTile& dst, const SrcTile& src,
                     Index src_start, Index dst_start, N n) noexcept
 {
     const auto src_data = src.getConstParticleTileData();
@@ -157,11 +159,11 @@ void copyParticles (PTile& dst, const PTile& src,
         copyParticle(dst_data, src_data, src_start+i, dst_start+i);
     });
 }
-    
+
 /**
  * \brief Apply the function f to all the particles in src, writing the
  * result to dst. This version does all the particles in src.
- * 
+ *
  * \tparam DstTile the dst particle tile type
  * \tparam SrcTile the src particle tile type
  * \tparam F a function object
@@ -272,7 +274,8 @@ void transformParticles (DstTile1& dst1, DstTile2& dst2, const SrcTile& src,
 /**
  * \brief Conditionally copy particles from src to dst based on the value of mask.
  * 
- * \tparam PTile the particle tile type
+ * \tparam DstTile the dst particle tile type
+ * \tparam SrcTile the src particle tile type
  * \tparam Index the index type, e.g. unsigned int
  *
  * \param dst the destination tile
@@ -280,9 +283,9 @@ void transformParticles (DstTile1& dst1, DstTile2& dst2, const SrcTile& src,
  * \param mask pointer to the mask - 1 means copy, 0 means don't copy
  *
  */
-template <typename PTile, typename Index,
+template <typename DstTile, typename SrcTile, typename Index,
           amrex::EnableIf_t<std::is_integral<Index>::value, int> foo = 0>
-Index filterParticles (PTile& dst, const PTile& src, const Index* mask) noexcept
+Index filterParticles (DstTile& dst, const SrcTile& src, const Index* mask) noexcept
 {
     auto np = src.numParticles();
     Gpu::DeviceVector<Index> offsets(np);
@@ -309,7 +312,8 @@ Index filterParticles (PTile& dst, const PTile& src, const Index* mask) noexcept
 /**
  * \brief Conditionally copy particles from src to dst based on a predicate.
  * 
- * \tparam PTile the particle tile type
+ * \tparam DstTile the dst particle tile type
+ * \tparam SrcTile the src particle tile type
  * \tparam Pred a function object
  *
  * \param dst the destination tile
@@ -317,11 +321,11 @@ Index filterParticles (PTile& dst, const PTile& src, const Index* mask) noexcept
  * \param p predicate function - particles will be copied if p returns true
  *
  */    
-template <typename PTile, typename Pred>
-auto filterParticles (PTile& dst, const PTile& src, Pred&& p) noexcept
-    -> decltype(p(typename PTile::ConstParticleTileDataType(), 0))
+template <typename DstTile, typename SrcTile, typename Pred>
+auto filterParticles (DstTile& dst, const SrcTile& src, Pred&& p) noexcept
+    -> decltype(p(typename SrcTile::ConstParticleTileDataType(), 0))
 {
-    using Index = decltype(p(typename PTile::ConstParticleTileDataType(), 0));
+    using Index = decltype(p(typename SrcTile::ConstParticleTileDataType(), 0));
     auto np = src.numParticles();
     Gpu::DeviceVector<Index> mask(np);
 
diff --git a/Src/Particle/AMReX_Particles.H b/Src/Particle/AMReX_Particles.H
index 54e4dee55c9..1c89f814622 100644
--- a/Src/Particle/AMReX_Particles.H
+++ b/Src/Particle/AMReX_Particles.H
@@ -113,9 +113,9 @@ struct ParticleInitType
     std::array<double, NArrayReal > real_array_data;
     std::array<int,    NArrayInt  > int_array_data;
 };
-    
+
 class ParticleContainerBase {};
-    
+
 /**
  * \brief A distributed container for Particles sorted onto the levels, grids,
  * and tiles of a block-structured AMR hierarachy.
@@ -151,7 +151,7 @@ public:
     using ParticleType = Particle<NStructReal, NStructInt>;
     //! \brief The type of the "SuperParticle" which stored all components in AoS form
     using SuperParticleType = Particle<NStructReal+NArrayReal, NStructInt+NArrayInt>;
-    //! \brief The type of the Real data 
+    //! \brief The type of the Real data
     using RealType = typename Particle<NStructReal, NStructInt>::RealType;
 
 #ifdef AMREX_SINGLE_PRECISION_PARTICLES
@@ -188,7 +188,7 @@ public:
       m_gdb(nullptr),
       m_runtime_comps_defined(false),
       m_num_runtime_real(0),
-      m_num_runtime_int(0)        
+      m_num_runtime_int(0)
     {
         Initialize ();
     }
@@ -199,7 +199,7 @@ public:
     //! \param gdb A pointer to a ParGDBBase, which contains pointers to the Geometry,
     //! DistributionMapping, and BoxArray objects that define the AMR hierarchy. Usually,
     //! this is generated by an AmrCore or AmrLevel object.
-    //! 
+    //!
     ParticleContainer (ParGDBBase* gdb)
         :
         communicate_real_comp(NArrayReal, true),
@@ -997,7 +997,7 @@ public:
         m_particles[lev][index].define(NumRuntimeRealComps(), NumRuntimeIntComps());
         return ParticlesAt(lev, iter);
     }
-    
+
     /**
     * \brief Functions depending the layout of the data.  Use with caution.
     *
@@ -1066,22 +1066,21 @@ public:
     static bool do_tiling;
     static IntVect tile_size;
 
-    void SetLevelDirectoriesCreated(bool tf) {
-      levelDirectoriesCreated = tf;
-    }
-
-    void SetUsePrePost(bool tf) {
+    void SetUsePrePost (bool tf) const {
       usePrePost = tf;
     }
-    bool GetUsePrePost() {
+    bool GetUsePrePost() const {
       return usePrePost;
     }
 
-    void SetUseUnlink(bool tf) {
+    int GetMaxNextIDPrePost () const { return maxnextidPrePost; }
+    Long GetNParticlesPrePost () const { return nparticlesPrePost; }
+
+    void SetUseUnlink (bool tf) const {
       doUnlink = tf;
     }
 
-    bool GetUseUnlink() {
+    bool GetUseUnlink() const {
       return doUnlink;
     }
 
@@ -1126,6 +1125,25 @@ public:
 
     bool OnSameGrids (int level, const MultiFab& mf) const { return m_gdb->OnSameGrids(level, mf); }
 
+    int NumRuntimeRealComps () const { return m_num_runtime_real; }
+    int NumRuntimeIntComps  () const { return m_num_runtime_int;  }
+
+    int NumRealComps () const { return NArrayReal + NumRuntimeRealComps(); }
+    int NumIntComps  () const { return NArrayInt  + NumRuntimeIntComps() ; }
+
+    //! ---- variables for i/o optimization saved for pre and post checkpoint
+    mutable bool usePrePost;
+    mutable bool doUnlink;
+    int maxnextidPrePost;
+    mutable int nOutFilesPrePost;
+    Long nparticlesPrePost;
+    Vector<Long> nParticlesAtLevelPrePost;          //!< ---- [level]
+    mutable Vector<Vector<int>>  whichPrePost;      //!< ---- [level]
+    mutable Vector<Vector<int>>  countPrePost;      //!< ---- [level]
+    mutable Vector<Vector<Long>> wherePrePost;      //!< ---- [level]
+    mutable std::string HdrFileNamePrePost;
+    mutable Vector<std::string> filePrefixPrePost;
+
 protected:
 
     mutable amrex::Vector<int> neighbor_procs;
@@ -1160,11 +1178,14 @@ protected:
     bool EnforcePeriodicWhere (ParticleType& prt, ParticleLocData& pld,
                                int lev_min = 0, int lev_max = -1, int local_grid=-1) const;
 
+public:
     void
-	WriteParticles (int level, std::ofstream& ofs, int fnum,
-					Vector<int>& which, Vector<int>& count, Vector<Long>& where,
-					const Vector<int>& write_real_comp, const Vector<int>& write_int_comp,
-                        const Vector<std::map<std::pair<int, int>, Gpu::DeviceVector<int>>>& particle_io_flags) const;
+    WriteParticles (int level, std::ofstream& ofs, int fnum,
+                    Vector<int>& which, Vector<int>& count, Vector<Long>& where,
+                    const Vector<int>& write_real_comp, const Vector<int>& write_int_comp,
+                    const Vector<std::map<std::pair<int, int>, Gpu::DeviceVector<int>>>& particle_io_flags) const;
+protected:
+
 #ifdef AMREX_USE_HDF5
 void WriteParticlesHDF5 ( hid_t grp, int level, Vector<int>& count, Vector<Long>& where ) const;
 
@@ -1174,7 +1195,7 @@ void ReadParticlesHDF5 (hsize_t offset, hsize_t cnt, int grd, int lev, hid_t int
 
     template <class RTYPE>
     void ReadParticles (int cnt, int grd, int lev, std::ifstream& ifs, int finest_level_in_file);
-    
+
     void SetParticleSize ();
 
     void BuildRedistributeMask(int lev, int nghost=1) const;
@@ -1189,32 +1210,12 @@ void ReadParticlesHDF5 (hsize_t offset, hsize_t cnt, int grd, int lev, hid_t int
     ParGDBBase* m_gdb;
     ParGDB      m_gdb_object;
 
-    //! ---- variables for i/o optimization saved for pre and post checkpoint
-    bool         levelDirectoriesCreated;
-    bool         usePrePost;
-    bool         doUnlink;
-    int maxnextidPrePost;
-    mutable int nOutFilesPrePost;
-    Long nparticlesPrePost;
-    Vector<Long> nParticlesAtLevelPrePost;          //!< ---- [level]
-    mutable Vector<Vector<int>>  whichPrePost;      //!< ---- [level]
-    mutable Vector<Vector<int>>  countPrePost;      //!< ---- [level]
-    mutable Vector<Vector<Long>> wherePrePost;      //!< ---- [level]
-    mutable std::string HdrFileNamePrePost;
-    mutable Vector<std::string> filePrefixPrePost;
-
     DenseBins<ParticleType> m_bins;
-    
+
 #ifdef AMREX_USE_GPU
     mutable AmrParticleLocator<DenseBins<Box> > m_particle_locator;
 #endif
 
-    int NumRuntimeRealComps () const { return m_num_runtime_real; }
-    int NumRuntimeIntComps  () const { return m_num_runtime_int;  } 
-    
-    int NumRealComps () const { return NArrayReal + NumRuntimeRealComps(); }
-    int NumIntComps  () const { return NArrayInt  + NumRuntimeIntComps() ; } 
-    
 private:
 
     virtual void particlePostLocate(ParticleType& /*p*/, const ParticleLocData& /*pld*/,
@@ -1234,7 +1235,7 @@ private:
     bool m_runtime_comps_defined;
     int m_num_runtime_real;
     int m_num_runtime_int;
-    
+
     size_t particle_size, superparticle_size;
     int num_real_comm_comps, num_int_comm_comps;
     Vector<ParticleLevel> m_particles;
diff --git a/Src/Particle/AMReX_WriteBinaryParticleData.H b/Src/Particle/AMReX_WriteBinaryParticleData.H
new file mode 100644
index 00000000000..7b61438870b
--- /dev/null
+++ b/Src/Particle/AMReX_WriteBinaryParticleData.H
@@ -0,0 +1,720 @@
+#ifndef AMREX_WRITE_BINARY_PARTICLE_DATA_H
+#define AMREX_WRITE_BINARY_PARTICLE_DATA_H
+
+#include <AMReX_TypeTraits.H>
+#include <AMReX_Particles.H>
+
+// note - namespace
+
+struct KeepValidFilter
+{    
+    template <typename SrcData>
+    AMREX_GPU_HOST_DEVICE
+    int operator() (const SrcData& src, int i) const noexcept
+    {
+        return (src.m_aos[i].id() > 0);
+    }
+};
+
+template <typename ParticleReal>
+std::size_t PSizeInFile (const Vector<int>& wrc, const Vector<int>& wic)
+{
+    std::size_t rsize = sizeof(ParticleReal)*std::accumulate(wrc.begin(), wrc.end(), 0);
+    std::size_t isize = sizeof(int)*std::accumulate(wic.begin(), wic.end(), 0);
+    return rsize + isize + AMREX_SPACEDIM*sizeof(ParticleReal) + 2*sizeof(int);
+}
+
+template <class PC, class F, EnableIf_t<IsParticleContainer<PC>::value, int> foo = 0>
+void WriteBinaryParticleDataSync (PC const& pc,
+                                  const std::string& dir, const std::string& name,
+                                  const Vector<int>& write_real_comp,
+                                  const Vector<int>& write_int_comp,
+                                  const Vector<std::string>& real_comp_names,
+                                  const Vector<std::string>& int_comp_names,
+                                  F&& f)
+{
+    BL_PROFILE("WriteBinaryParticleData()");
+    AMREX_ASSERT(pc.OK());
+
+    AMREX_ASSERT(sizeof(typename PC::ParticleType::RealType) == 4 ||
+                 sizeof(typename PC::ParticleType::RealType) == 8);
+
+    constexpr int NStructReal = PC::NStructReal;
+    constexpr int NStructInt  = PC::NStructInt;
+
+    const int NProcs = ParallelDescriptor::NProcs();
+    const int IOProcNumber = ParallelDescriptor::IOProcessorNumber();
+
+    AMREX_ALWAYS_ASSERT(real_comp_names.size() == pc.NumRealComps() + NStructReal);
+    AMREX_ALWAYS_ASSERT( int_comp_names.size() == pc.NumIntComps() + NStructInt);
+
+    std::string pdir = dir;
+    if ( not pdir.empty() and pdir[pdir.size()-1] != '/') pdir += '/';
+    pdir += name;
+
+    if (ParallelDescriptor::IOProcessor())
+    {
+        if ( ! amrex::UtilCreateDirectory(pdir, 0755))
+        {
+            amrex::CreateDirectoryFailed(pdir);
+        }
+    }
+    ParallelDescriptor::Barrier();
+
+    std::ofstream HdrFile;
+
+    Long nparticles = 0;
+    int maxnextid;
+
+    // evaluate f for every particle to determine which ones to output
+    Vector<std::map<std::pair<int, int>, Gpu::DeviceVector<int> > > particle_io_flags(pc.GetParticles().size());
+    for (int lev = 0; lev < pc.GetParticles().size();  lev++)
+    {
+        const auto& pmap = pc.GetParticles(lev);
+        for (const auto& kv : pmap)
+        {
+            const auto ptd = kv.second.getConstParticleTileData();
+            const auto np = kv.second.numParticles();
+            particle_io_flags[lev][kv.first].resize(np, 0);
+            auto pflags = particle_io_flags[lev][kv.first].data();
+            AMREX_HOST_DEVICE_FOR_1D( np, k,
+            {
+                const auto p = ptd.getSuperParticle(k);
+                pflags[k] = f(p);
+            });
+        }
+    }
+
+    Gpu::Device::synchronize();
+
+    if(pc.GetUsePrePost())
+    {
+        nparticles = pc.GetNParticlesPrePost();
+        maxnextid  = pc.GetMaxNextIDPrePost();
+    }
+    else
+    {
+        nparticles = 0;
+        maxnextid  = PC::ParticleType::NextID();
+
+        for (int lev = 0; lev < pc.GetParticles().size();  lev++)
+        {
+            const auto& pmap = pc.GetParticles(lev);
+            for (const auto& kv : pmap)
+            {
+                const auto& pflags = particle_io_flags[lev][kv.first];
+                for (int k = 0; k < kv.second.numParticles(); ++k)
+                {
+                    if (pflags[k]) nparticles++;
+                }
+            }
+        }
+
+        ParallelDescriptor::ReduceLongSum(nparticles, IOProcNumber);
+        PC::ParticleType::NextID(maxnextid);
+        ParallelDescriptor::ReduceIntMax(maxnextid, IOProcNumber);
+    }
+
+    if (ParallelDescriptor::IOProcessor())
+    {
+        std::string HdrFileName = pdir;
+
+        if ( ! HdrFileName.empty() && HdrFileName[HdrFileName.size()-1] != '/')
+            HdrFileName += '/';
+
+        HdrFileName += "Header";
+        pc.HdrFileNamePrePost = HdrFileName;
+
+        HdrFile.open(HdrFileName.c_str(), std::ios::out|std::ios::trunc);
+
+        if ( ! HdrFile.good()) amrex::FileOpenFailed(HdrFileName);
+
+        //
+        // First thing written is our Checkpoint/Restart version string.
+        // We append "_single" or "_double" to the version string indicating
+        // whether we're using "float" or "double" floating point data in the
+        // particles so that we can Restart from the checkpoint files.
+        //
+        if (sizeof(typename PC::ParticleType::RealType) == 4)
+        {
+            HdrFile << PC::ParticleType::Version() << "_single" << '\n';
+        }
+        else
+        {
+            HdrFile << PC::ParticleType::Version() << "_double" << '\n';
+        }
+
+        int num_output_real = 0;
+        for (int i = 0; i < pc.NumRealComps() + NStructReal; ++i)
+            if (write_real_comp[i]) ++num_output_real;
+
+        int num_output_int = 0;
+        for (int i = 0; i < pc.NumIntComps() + NStructInt; ++i)
+            if (write_int_comp[i]) ++num_output_int;
+
+        // AMREX_SPACEDIM and N for sanity checking.
+        HdrFile << AMREX_SPACEDIM << '\n';
+
+        // The number of extra real parameters
+        HdrFile << num_output_real << '\n';
+
+        // Real component names
+        for (int i = 0; i < NStructReal + pc.NumRealComps(); ++i )
+            if (write_real_comp[i]) HdrFile << real_comp_names[i] << '\n';
+
+        // The number of extra int parameters
+        HdrFile << num_output_int << '\n';
+
+        // int component names
+        for (int i = 0; i < NStructInt + pc.NumIntComps(); ++i )
+            if (write_int_comp[i]) HdrFile << int_comp_names[i] << '\n';
+
+        bool is_checkpoint = true; // legacy
+        HdrFile << is_checkpoint << '\n';
+
+        // The total number of particles.
+        HdrFile << nparticles << '\n';
+
+        // The value of nextid that we need to restore on restart.
+        HdrFile << maxnextid << '\n';
+
+        // Then the finest level of the AMR hierarchy.
+        HdrFile << pc.finestLevel() << '\n';
+
+        // Then the number of grids at each level.
+        for (int lev = 0; lev <= pc.finestLevel(); lev++)
+            HdrFile << pc.ParticleBoxArray(lev).size() << '\n';
+    }
+
+    // We want to write the data out in parallel.
+    // We'll allow up to nOutFiles active writers at a time.
+    int nOutFiles(256);
+
+    ParmParse pp("particles");
+    pp.query("particles_nfiles",nOutFiles);
+    if(nOutFiles == -1) nOutFiles = NProcs;
+    nOutFiles = std::max(1, std::min(nOutFiles,NProcs));
+    pc.nOutFilesPrePost = nOutFiles;
+
+    for (int lev = 0; lev <= pc.finestLevel(); lev++)
+    {
+        bool gotsome;
+        if(pc.usePrePost)
+        {
+            gotsome = (pc.nParticlesAtLevelPrePost[lev] > 0);
+        }
+        else
+        {
+            gotsome = (pc.NumberOfParticlesAtLevel(lev) > 0);
+        }
+
+        // We store the particles at each level in their own subdirectory.
+        std::string LevelDir = pdir;
+
+        if (gotsome)
+        {
+            if ( ! LevelDir.empty() && LevelDir[LevelDir.size()-1] != '/') LevelDir += '/';
+
+            LevelDir = amrex::Concatenate(LevelDir + "Level_", lev, 1);
+
+            if (ParallelDescriptor::IOProcessor())
+                if ( ! amrex::UtilCreateDirectory(LevelDir, 0755))
+                    amrex::CreateDirectoryFailed(LevelDir);
+            ParallelDescriptor::Barrier();
+        }
+
+        // Write out the header for each particle
+        if (gotsome and ParallelDescriptor::IOProcessor()) {
+            std::string HeaderFileName = LevelDir;
+            HeaderFileName += "/Particle_H";
+            std::ofstream ParticleHeader(HeaderFileName);
+
+            pc.ParticleBoxArray(lev).writeOn(ParticleHeader);
+            ParticleHeader << '\n';
+
+            ParticleHeader.flush();
+            ParticleHeader.close();
+        }
+
+        MFInfo info;
+        info.SetAlloc(false);
+        MultiFab state(pc.ParticleBoxArray(lev),
+                       pc.ParticleDistributionMap(lev),
+                       1,0,info);
+
+        // We eventually want to write out the file name and the offset
+        // into that file into which each grid of particles is written.
+        Vector<int>  which(state.size(),0);
+        Vector<int > count(state.size(),0);
+        Vector<Long> where(state.size(),0);
+
+        std::string filePrefix(LevelDir);
+        filePrefix += '/';
+        filePrefix += PC::ParticleType::DataPrefix();
+        if(pc.usePrePost) {
+            pc.filePrefixPrePost[lev] = filePrefix;
+        }
+        bool groupSets(false), setBuf(true);
+
+        if (gotsome)
+        {
+            for(NFilesIter nfi(nOutFiles, filePrefix, groupSets, setBuf); nfi.ReadyToWrite(); ++nfi)
+            {
+                std::ofstream& myStream = (std::ofstream&) nfi.Stream();
+                pc.WriteParticles(lev, myStream, nfi.FileNumber(), which, count, where,
+                                  write_real_comp, write_int_comp, particle_io_flags);
+            }
+
+            if(pc.usePrePost) {
+                pc.whichPrePost[lev] = which;
+                pc.countPrePost[lev] = count;
+                pc.wherePrePost[lev] = where;
+            } else {
+                ParallelDescriptor::ReduceIntSum (which.dataPtr(), which.size(), IOProcNumber);
+                ParallelDescriptor::ReduceIntSum (count.dataPtr(), count.size(), IOProcNumber);
+                ParallelDescriptor::ReduceLongSum(where.dataPtr(), where.size(), IOProcNumber);
+            }
+        }
+
+        if (ParallelDescriptor::IOProcessor())
+        {
+            if(pc.GetUsePrePost()) {
+                // ---- write to the header and unlink in CheckpointPost
+            } else {
+                for (int j = 0; j < state.size(); j++)
+                {
+                    HdrFile << which[j] << ' ' << count[j] << ' ' << where[j] << '\n';
+                }
+
+                if (gotsome && pc.doUnlink)
+                {
+                    // Unlink any zero-length data files.
+                    Vector<Long> cnt(nOutFiles,0);
+
+                    for (int i = 0, N=count.size(); i < N; i++) {
+                        cnt[which[i]] += count[i];
+                    }
+
+                    for (int i = 0, N=cnt.size(); i < N; i++)
+                    {
+                        if (cnt[i] == 0)
+                        {
+                            std::string FullFileName = NFilesIter::FileName(i, filePrefix);
+                            FileSystem::Remove(FullFileName);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (ParallelDescriptor::IOProcessor())
+    {
+        HdrFile.flush();
+        HdrFile.close();
+        if ( ! HdrFile.good())
+        {
+            amrex::Abort("ParticleContainer::Checkpoint(): problem writing HdrFile");
+        }
+    }
+}
+
+template <class PC, EnableIf_t<IsParticleContainer<PC>::value, int> foo = 0>
+void WriteBinaryParticleDataAsync (PC const& pc,
+                                   const std::string& dir, const std::string& name,
+                                   const Vector<int>& write_real_comp,
+                                   const Vector<int>& write_int_comp,
+                                   const Vector<std::string>& real_comp_names,
+                                   const Vector<std::string>& int_comp_names)
+{
+    BL_PROFILE("WriteBinaryParticleDataAsync");
+    AMREX_ASSERT(pc.OK());
+
+    AMREX_ASSERT(sizeof(typename PC::ParticleType::RealType) == 4 ||
+                 sizeof(typename PC::ParticleType::RealType) == 8);
+
+    constexpr int NStructReal = PC::NStructReal;
+    constexpr int NStructInt  = PC::NStructInt;
+    constexpr int NArrayReal  = PC::NArrayReal;
+    constexpr int NArrayInt   = PC::NArrayInt;
+
+    const int MyProc = ParallelDescriptor::MyProc();
+    const int NProcs = ParallelDescriptor::NProcs();
+    const int IOProcNumber = NProcs - 1;
+
+    AMREX_ALWAYS_ASSERT(real_comp_names.size() == pc.NumRealComps() + NStructReal);
+    AMREX_ALWAYS_ASSERT( int_comp_names.size() == pc.NumIntComps() + NStructInt);
+
+    Vector<LayoutData<Long> > np_per_grid_local(pc.finestLevel()+1);
+    for (int lev = 0; lev <= pc.finestLevel(); lev++)
+    {
+        np_per_grid_local[lev].define(pc.ParticleBoxArray(lev), pc.ParticleDistributionMap(lev));
+        for (MFIter mfi = pc.MakeMFIter(lev); mfi.isValid(); ++mfi)
+        {
+            int gid = mfi.index();
+            const auto& ptile = pc.ParticlesAt(lev, mfi);
+            const auto& aos = ptile.GetArrayOfStructs();
+            const auto pstruct = aos().dataPtr();
+            const int np = ptile.numParticles();
+
+            ReduceOps<ReduceOpSum> reduce_op;
+            ReduceData<int> reduce_data(reduce_op);
+            using ReduceTuple = typename decltype(reduce_data)::Type;
+
+            reduce_op.eval(np, reduce_data,
+            [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple
+            {
+                return (pstruct[i].id() > 0) ? 1 : 0;
+            });
+
+            int np_valid = amrex::get<0>(reduce_data.value());
+            np_per_grid_local[lev][gid] += np_valid;
+        }
+    }
+
+    Vector<Vector<Long> > np_per_grid_global(pc.finestLevel()+1);
+    Long total_np = 0;
+    Vector<Long> np_per_level(pc.finestLevel()+1);
+    for (int lev = 0; lev <= pc.finestLevel(); lev++)
+    {
+        np_per_grid_global[lev].resize(np_per_grid_local[lev].size());
+        ParallelDescriptor::GatherLayoutDataToVector(np_per_grid_local[lev],
+                                                     np_per_grid_global[lev],
+                                                     IOProcNumber);
+        np_per_level[lev] = std::accumulate(np_per_grid_global[lev].begin(),
+                                            np_per_grid_global[lev].end(), 0L);
+        total_np += np_per_level[lev];
+    }
+
+    std::string pdir = dir;
+    if ( not pdir.empty() and pdir[pdir.size()-1] != '/') pdir += '/';
+    pdir += name;
+
+    if (MyProc == IOProcNumber)
+    {
+        if ( ! amrex::UtilCreateDirectory(pdir, 0755))
+        {
+            amrex::CreateDirectoryFailed(pdir);
+        }
+        for (int lev = 0; lev <= pc.finestLevel(); lev++)
+        {
+            std::string LevelDir = pdir;
+            bool gotsome = np_per_level[lev];
+
+            if (gotsome)
+            {
+                if ( ! LevelDir.empty() && LevelDir[LevelDir.size()-1] != '/') LevelDir += '/';
+
+                LevelDir = amrex::Concatenate(LevelDir + "Level_", lev, 1);
+
+                if ( ! amrex::UtilCreateDirectory(LevelDir, 0755))
+                {
+                    amrex::CreateDirectoryFailed(LevelDir);
+                }
+
+                std::string HeaderFileName = LevelDir;
+                HeaderFileName += "/Particle_H";
+                std::ofstream ParticleHeader(HeaderFileName);
+
+                pc.ParticleBoxArray(lev).writeOn(ParticleHeader);
+                ParticleHeader << '\n';
+
+                ParticleHeader.flush();
+                ParticleHeader.close();
+            }
+        }
+    }
+    ParallelDescriptor::Barrier();
+
+    int maxnextid = PC::ParticleType::NextID();
+    ParallelDescriptor::ReduceIntMax(maxnextid, IOProcNumber);
+
+    Vector<Long> np_on_rank(NProcs, 0L);
+    std::size_t psize = PSizeInFile<ParticleReal>(write_real_comp, write_int_comp);
+    Vector<int64_t> rank_start_offset(NProcs);
+    if (MyProc == IOProcNumber)
+    {
+        for (int lev = 0; lev <= pc.finestLevel(); lev++)
+        {
+            for (int k = 0; k < pc.ParticleBoxArray(lev).size(); ++k)
+            {
+                int rank = pc.ParticleDistributionMap(lev)[k];
+                np_on_rank[rank] += np_per_grid_global[lev][k];
+            }
+        }
+
+        for (int ip = 0; ip < NProcs; ++ip)
+        {
+            auto info = AsyncOut::GetWriteInfo(ip);
+            rank_start_offset[ip] = (info.ispot == 0) ? 0 : rank_start_offset[ip-1] + np_on_rank[ip-1]*psize;
+        }
+    }
+
+    // make tmp particle tiles in pinned memory to write
+    using PinnedPTile = ParticleTile<NStructReal, NStructInt, NArrayReal, NArrayInt,
+                                     PinnedArenaAllocator>;
+    auto myptiles = std::make_shared<Vector<std::map<std::pair<int, int>,PinnedPTile> > >();
+    myptiles->resize(pc.finestLevel()+1);
+    for (int lev = 0; lev <= pc.finestLevel(); lev++)
+    {
+        for (MFIter mfi = pc.MakeMFIter(lev); mfi.isValid(); ++mfi)
+        {
+            auto& new_ptile = (*myptiles)[lev][std::make_pair(mfi.index(),
+                                                              mfi.LocalTileIndex())];
+
+            if (np_per_grid_local[lev][mfi.index()] > 0)
+            {
+                const auto& ptile = pc.ParticlesAt(lev, mfi);
+                const auto& aos = ptile.GetArrayOfStructs();
+                const auto pstruct = aos().dataPtr();
+                new_ptile.resize(np_per_grid_local[lev][mfi.index()]);
+                amrex::filterParticles(new_ptile, ptile, KeepValidFilter());
+            }
+        }
+    }
+
+    int finest_level = pc.finestLevel();
+    Vector<BoxArray> bas;
+    Vector<DistributionMapping> dms;
+    for (int lev = 0; lev <= pc.finestLevel(); lev++)
+    {
+        bas.push_back(pc.ParticleBoxArray(lev));
+        dms.push_back(pc.ParticleDistributionMap(lev));
+    }
+
+    int nrc = pc.NumRealComps();
+    int nic = pc.NumIntComps();
+
+    auto RD = pc.ParticleRealDescriptor;
+
+    AsyncOut::Submit([=] ()
+    {
+        if (MyProc == IOProcNumber)
+        {
+            std::string HdrFileName = pdir;
+            std::ofstream HdrFile;
+
+            if ( ! HdrFileName.empty() && HdrFileName[HdrFileName.size()-1] != '/')
+                HdrFileName += '/';
+
+            HdrFileName += "Header";
+
+            HdrFile.open(HdrFileName.c_str(), std::ios::out|std::ios::trunc);
+
+            if ( ! HdrFile.good()) amrex::FileOpenFailed(HdrFileName);
+
+            if (sizeof(typename PC::ParticleType) == 4)
+            {
+                HdrFile << PC::ParticleType::Version() << "_single" << '\n';
+            }
+            else
+            {
+                HdrFile << PC::ParticleType::Version() << "_double" << '\n';
+            }
+
+            int num_output_real = 0;
+            for (int i = 0; i < nrc + NStructReal; ++i)
+                if (write_real_comp[i]) ++num_output_real;
+
+            int num_output_int = 0;
+            for (int i = 0; i < nic + NStructInt; ++i)
+                if (write_int_comp[i]) ++num_output_int;
+
+            // AMREX_SPACEDIM and N for sanity checking.
+            HdrFile << AMREX_SPACEDIM << '\n';
+
+            // The number of extra real parameters
+            HdrFile << num_output_real << '\n';
+
+            // Real component names
+            for (int i = 0; i < NStructReal + nrc; ++i )
+                if (write_real_comp[i]) HdrFile << real_comp_names[i] << '\n';
+
+            // The number of extra int parameters
+            HdrFile << num_output_int << '\n';
+
+            // int component names
+            for (int i = 0; i < NStructInt + nic; ++i )
+                if (write_int_comp[i]) HdrFile << int_comp_names[i] << '\n';
+
+            bool is_checkpoint = true; // legacy
+            HdrFile << is_checkpoint << '\n';
+
+            // The total number of particles.
+            HdrFile << total_np << '\n';
+
+            // The value of nextid that we need to restore on restart.
+            HdrFile << maxnextid << '\n';
+
+            // Then the finest level of the AMR hierarchy.
+            HdrFile << finest_level << '\n';
+
+            // Then the number of grids at each level.
+            for (int lev = 0; lev <= finest_level; lev++)
+                HdrFile << dms[lev].size() << '\n';
+
+            for (int lev = 0; lev <= finest_level; lev++)
+            {
+                Vector<int64_t> grid_offset(NProcs, 0);
+                for (int k = 0; k < bas[lev].size(); ++k)
+                {
+                    int rank = dms[lev][k];
+                    auto info = AsyncOut::GetWriteInfo(rank);
+                    HdrFile << info.ifile << ' '
+                            << np_per_grid_global[lev][k] << ' '
+                            << grid_offset[rank] + rank_start_offset[rank] << '\n';
+                    grid_offset[rank] += np_per_grid_global[lev][k]*psize;
+                }
+            }
+
+            HdrFile.flush();
+            HdrFile.close();
+            if ( ! HdrFile.good())
+            {
+                amrex::Abort("ParticleContainer::Checkpoint(): problem writing HdrFile");
+            }
+        }
+
+        AsyncOut::Wait();  // Wait for my turn
+
+        for (int lev = 0; lev <= finest_level; lev++)
+        {
+            // For a each grid, the tiles it contains
+            std::map<int, Vector<int> > tile_map;
+
+            for (const auto& kv : (*myptiles)[lev])
+            {
+                const int grid = kv.first.first;
+                const int tile = kv.first.second;
+                tile_map[grid].push_back(tile);
+            }
+
+            std::string LevelDir = pdir;
+            if ( ! LevelDir.empty() && LevelDir[LevelDir.size()-1] != '/') LevelDir += '/';
+            LevelDir = amrex::Concatenate(LevelDir + "Level_", lev, 1);
+            std::string filePrefix(LevelDir);
+            filePrefix += '/';
+            filePrefix += PC::ParticleType::DataPrefix();
+            auto info = AsyncOut::GetWriteInfo(MyProc);
+            std::string file_name = amrex::Concatenate(filePrefix, info.ifile, 5);
+            std::ofstream ofs;
+            ofs.open(file_name.c_str(), (info.ispot == 0) ? (std::ios::binary | std::ios::trunc)
+                     : (std::ios::binary | std::ios::app));
+
+            for (int k = 0; k < bas[lev].size(); ++k)
+            {
+                int rank = dms[lev][k];
+                if (rank != MyProc) continue;
+                const int grid = k;
+                if (np_per_grid_local[lev][grid] == 0) continue;
+
+                // First write out the integer data in binary.
+                int num_output_int = 0;
+                for (int i = 0; i < nic + NStructInt; ++i)
+                    if (write_int_comp[i]) ++num_output_int;
+
+                const int iChunkSize = 2 + num_output_int;
+                Vector<int> istuff(np_per_grid_local[lev][grid]*iChunkSize);
+                int* iptr = istuff.dataPtr();
+
+                for (unsigned i = 0; i < tile_map[grid].size(); i++) {
+                    auto ptile_index = std::make_pair(grid, tile_map[grid][i]);
+                    const auto& pbox = (*myptiles)[lev][ptile_index];
+                    for (int pindex = 0;
+                         pindex < pbox.GetArrayOfStructs().numParticles(); ++pindex)
+                    {
+                        const auto& aos = pbox.GetArrayOfStructs();
+                        const auto& p = aos[pindex];
+
+                        if (p.id() <= 0) continue;
+
+                        // always write these
+                        *iptr = p.id(); ++iptr;
+                        *iptr = p.cpu(); ++iptr;
+
+                        // optionally write these
+                        for (int j = 0; j < NStructInt; j++)
+                        {
+                            if (write_int_comp[j])
+                            {
+                                *iptr = p.idata(j);
+                                ++iptr;
+                            }
+                        }
+
+                        const auto& soa  = pbox.GetStructOfArrays();
+                        for (int j = 0; j < nic; j++)
+                        {
+                            if (write_int_comp[NStructInt+j])
+                            {
+                                *iptr = soa.GetIntData(j)[pindex];
+                                ++iptr;
+                            }
+                        }
+                    }
+                }
+
+                writeIntData(istuff.dataPtr(), istuff.size(), ofs);
+                ofs.flush();  // Some systems require this flush() (probably due to a bug)
+
+                // Write the Real data in binary.
+                int num_output_real = 0;
+                for (int i = 0; i < nrc + NStructReal; ++i)
+                    if (write_real_comp[i]) ++num_output_real;
+
+                const int rChunkSize = AMREX_SPACEDIM + num_output_real;
+                Vector<typename PC::ParticleType::RealType> rstuff(np_per_grid_local[lev][grid]*rChunkSize);
+                typename PC::ParticleType::RealType* rptr = rstuff.dataPtr();
+
+                for (unsigned i = 0; i < tile_map[grid].size(); i++) {
+                    auto ptile_index = std::make_pair(grid, tile_map[grid][i]);
+                    const auto& pbox = (*myptiles)[lev][ptile_index];
+                    for (int pindex = 0;
+                         pindex < pbox.GetArrayOfStructs().numParticles(); ++pindex)
+                    {
+                        const auto& aos = pbox.GetArrayOfStructs();
+                        const auto& p = aos[pindex];
+
+                        if (p.id() <= 0) continue;
+
+                        // always write these
+                        for (int j = 0; j < AMREX_SPACEDIM; j++) rptr[j] = p.pos(j);
+                        rptr += AMREX_SPACEDIM;
+
+                        // optionally write these
+                        for (int j = 0; j < NStructReal; j++)
+                        {
+                            if (write_real_comp[j])
+                            {
+                                *rptr = p.rdata(j);
+                                ++rptr;
+                            }
+                        }
+
+                        const auto& soa  = pbox.GetStructOfArrays();
+                        for (int j = 0; j < nrc; j++)
+                        {
+                            if (write_real_comp[NStructReal+j])
+                            {
+                                *rptr = (typename PC::ParticleType::RealType) soa.GetRealData(j)[pindex];
+                                ++rptr;
+                            }
+                        }
+                    }
+                }
+
+                if (sizeof(typename PC::ParticleType::RealType) == 4) {
+                    writeFloatData((float*) rstuff.dataPtr(), rstuff.size(), ofs, RD);
+                }
+                else if (sizeof(typename PC::ParticleType::RealType) == 8) {
+                    writeDoubleData((double*) rstuff.dataPtr(), rstuff.size(), ofs, RD);
+                }
+
+                ofs.flush();  // Some systems require this flush() (probably due to a bug)
+            }
+        }
+        AsyncOut::Notify();  // Notify others I am done
+    });
+}
+
+#endif
diff --git a/Src/Particle/CMakeLists.txt b/Src/Particle/CMakeLists.txt
index 8a220e4208c..31315e27ff1 100644
--- a/Src/Particle/CMakeLists.txt
+++ b/Src/Particle/CMakeLists.txt
@@ -42,4 +42,5 @@ target_sources( amrex
    AMReX_DenseBins.H
    AMReX_BinIterator.H
    AMReX_ParticleTransformation.H
+   AMReX_WriteBinaryParticleData.H
    )
diff --git a/Src/Particle/Make.package b/Src/Particle/Make.package
index 1987f9a2042..0646d5e07ae 100644
--- a/Src/Particle/Make.package
+++ b/Src/Particle/Make.package
@@ -10,6 +10,7 @@ C$(AMREX_PARTICLE)_headers += AMReX_ParIter.H AMReX_ParticleMPIUtil.H AMReX_Stru
 C$(AMREX_PARTICLE)_headers += AMReX_ParticleUtil.H AMReX_NeighborList.H AMReX_ParticleBufferMap.H AMReX_ParticleCommunication.H AMReX_ParticleReduce.H AMReX_ParticleLocator.H
 C$(AMREX_PARTICLE)_headers += AMReX_NeighborParticlesCPUImpl.H AMReX_NeighborParticlesGPUImpl.H
 C$(AMREX_PARTICLE)_headers += AMReX_Particle_mod_K.H AMReX_TracerParticle_mod_K.H AMReX_ParticleMesh.H AMReX_ParticleIO.H AMReX_ParticleHDF5.H AMReX_DenseBins.H AMReX_ParticleTransformation.H AMReX_SparseBins.H AMReX_BinIterator.H
+C$(AMREX_PARTICLE)_headers += AMReX_WriteBinaryParticleData.H
 
 VPATH_LOCATIONS += $(AMREX_HOME)/Src/Particle
 INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Particle
diff --git a/Tests/Particles/AsyncIO/GNUmakefile b/Tests/Particles/AsyncIO/GNUmakefile
new file mode 100644
index 00000000000..d227a7b2656
--- /dev/null
+++ b/Tests/Particles/AsyncIO/GNUmakefile
@@ -0,0 +1,32 @@
+AMREX_HOME ?= ../../../
+
+DEBUG	= TRUE
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+TINY_PROFILE = TRUE
+USE_PARTICLES = TRUE
+
+PRECISION = DOUBLE
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+
+MPI_THREAD_MULTIPLE = TRUE
+
+###################################################
+
+EBASE     = main
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+include $(AMREX_HOME)/Src/Boundary/Make.package
+include $(AMREX_HOME)/Src/AmrCore/Make.package
+include $(AMREX_HOME)/Src/Particle/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/Particles/AsyncIO/Make.package b/Tests/Particles/AsyncIO/Make.package
new file mode 100644
index 00000000000..7f43e5e87cb
--- /dev/null
+++ b/Tests/Particles/AsyncIO/Make.package
@@ -0,0 +1,2 @@
+CEXE_sources += main.cpp
+
diff --git a/Tests/Particles/AsyncIO/inputs b/Tests/Particles/AsyncIO/inputs
new file mode 100644
index 00000000000..9d81c23f8a3
--- /dev/null
+++ b/Tests/Particles/AsyncIO/inputs
@@ -0,0 +1,29 @@
+
+# Domain size
+
+#nx = 32 # number of grid points along the x axis
+#ny = 32 # number of grid points along the y axis 
+#nz = 32 # number of grid points along the z axis
+
+#nx = 64 # number of grid points along the x axis
+#ny = 64 # number of grid points along the y axis 
+#nz = 64 # number of grid points along the z axis
+
+nx = 128 # number of grid points along the x axis
+ny = 128 # number of grid points along the y axis 
+nz = 128 # number of grid points along the z axis
+
+# Maximum allowable size of each subdomain in the problem domain; 
+#    this is used to decompose the domain for parallel calculations.
+max_grid_size = 16
+
+# Number of particles per cell
+nppc = 10
+
+# Verbosity
+verbose = true   # set to true to get more verbosity 
+
+# Number of levels
+nlevs = 2
+
+amrex.async_io=1
\ No newline at end of file
diff --git a/Tests/Particles/AsyncIO/main.cpp b/Tests/Particles/AsyncIO/main.cpp
new file mode 100644
index 00000000000..7fffa7a9f3b
--- /dev/null
+++ b/Tests/Particles/AsyncIO/main.cpp
@@ -0,0 +1,312 @@
+#include <iostream>
+
+#include <AMReX.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_MultiFabUtil.H>
+#include <AMReX_AmrParticles.H>
+#include <AMReX_PlotFileUtil.H>
+
+#include <thread>
+
+using namespace amrex;
+
+static constexpr int NSR = 1 + AMREX_SPACEDIM;
+static constexpr int NSI = 0;
+static constexpr int NAR = 0;
+static constexpr int NAI = 0;
+
+struct TestParams {
+  int nx;
+  int ny;
+  int nz;
+  int max_grid_size;
+  int nppc;
+  int nlevs;
+  bool verbose;
+};
+
+void get_position_unit_cell(Real* r, const IntVect& nppc, int i_part)
+{
+    int nx = nppc[0];
+#if AMREX_SPACEDIM > 1
+    int ny = nppc[1];
+#else
+    int ny = 1;
+#endif
+#if AMREX_SPACEDIM > 2
+    int nz = nppc[2];
+#else
+    int nz = 1;
+#endif
+
+    int ix_part = i_part/(ny * nz);
+    int iy_part = (i_part % (ny * nz)) % ny;
+    int iz_part = (i_part % (ny * nz)) / ny;
+
+    r[0] = (0.5+ix_part)/nx;
+    r[1] = (0.5+iy_part)/ny;
+    r[2] = (0.5+iz_part)/nz;
+}
+
+class MyParticleContainer
+    : public amrex::ParticleContainer<NSR, NSI, NAR, NAI>
+{
+
+public:
+
+    MyParticleContainer (const Vector<amrex::Geometry>            & a_geom,
+                         const Vector<amrex::DistributionMapping> & a_dmap,
+                         const Vector<amrex::BoxArray>            & a_ba,
+                         const Vector<amrex::IntVect>             & a_rr)
+        : amrex::ParticleContainer<NSR, NSI, NAR, NAI>(a_geom, a_dmap, a_ba, a_rr)
+    {}
+
+    void InitParticles (const amrex::IntVect& a_num_particles_per_cell)
+    {
+        BL_PROFILE("InitParticles");
+
+        const int lev = 0;  // only add particles on level 0
+        const Real* dx = Geom(lev).CellSize();
+        const Real* plo = Geom(lev).ProbLo();
+
+        const int num_ppc = AMREX_D_TERM( a_num_particles_per_cell[0],
+                                         *a_num_particles_per_cell[1],
+                                         *a_num_particles_per_cell[2]);
+
+        for(MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi)
+        {
+            const Box& tile_box  = mfi.tilebox();
+
+            Gpu::HostVector<ParticleType> host_particles;
+            std::array<Gpu::HostVector<Real>, NAR> host_real;
+            std::array<Gpu::HostVector<int>, NAI> host_int;
+
+            std::vector<Gpu::HostVector<Real> > host_runtime_real(NumRuntimeRealComps());
+            std::vector<Gpu::HostVector<int> > host_runtime_int(NumRuntimeIntComps());
+
+            for (IntVect iv = tile_box.smallEnd(); iv <= tile_box.bigEnd(); tile_box.next(iv))
+            {
+                for (int i_part=0; i_part<num_ppc;i_part++) {
+                    Real r[3];
+                    get_position_unit_cell(r, a_num_particles_per_cell, i_part);
+
+                    ParticleType p;
+                    p.id()  = ParticleType::NextID();
+                    p.cpu() = ParallelDescriptor::MyProc();
+                    p.pos(0) = plo[0] + (iv[0] + r[0])*dx[0];
+#if AMREX_SPACEDIM > 1
+                    p.pos(1) = plo[1] + (iv[1] + r[1])*dx[1];
+#endif
+#if AMREX_SPACEDIM > 2
+                    p.pos(2) = plo[2] + (iv[2] + r[2])*dx[2];
+#endif
+
+                    for (int i = 0; i < NSR; ++i) p.rdata(i) = p.id();
+                    for (int i = 0; i < NSI; ++i) p.idata(i) = p.id();
+
+                    host_particles.push_back(p);
+                    for (int i = 0; i < NAR; ++i)
+                        host_real[i].push_back(p.id());
+                    for (int i = 0; i < NAI; ++i)
+                        host_int[i].push_back(p.id());
+                    for (int i = 0; i < NumRuntimeRealComps(); ++i)
+                        host_runtime_real[i].push_back(p.id());
+                    for (int i = 0; i < NumRuntimeIntComps(); ++i)
+                        host_runtime_int[i].push_back(p.id());
+                }
+            }
+
+            auto& particle_tile = DefineAndReturnParticleTile(lev, mfi.index(), mfi.LocalTileIndex());
+            auto old_size = particle_tile.GetArrayOfStructs().size();
+            auto new_size = old_size + host_particles.size();
+            particle_tile.resize(new_size);
+
+            Gpu::copy(Gpu::hostToDevice,
+                      host_particles.begin(),
+                      host_particles.end(),
+                      particle_tile.GetArrayOfStructs().begin() + old_size);
+
+            auto& soa = particle_tile.GetStructOfArrays();
+            for (int i = 0; i < NAR; ++i)
+            {
+                Gpu::copy(Gpu::hostToDevice,
+                          host_real[i].begin(),
+                          host_real[i].end(),
+                          soa.GetRealData(i).begin() + old_size);
+            }
+
+            for (int i = 0; i < NAI; ++i)
+            {
+                Gpu::copy(Gpu::hostToDevice,
+                          host_int[i].begin(),
+                          host_int[i].end(),
+                          soa.GetIntData(i).begin() + old_size);
+            }
+            for (int i = 0; i < NumRuntimeRealComps(); ++i)
+            {
+                Gpu::copy(Gpu::hostToDevice,
+                          host_runtime_real[i].begin(),
+                          host_runtime_real[i].end(),
+                          soa.GetRealData(NAR+i).begin() + old_size);
+            }
+
+            for (int i = 0; i < NumRuntimeIntComps(); ++i)
+            {
+                Gpu::copy(Gpu::hostToDevice,
+                          host_runtime_int[i].begin(),
+                          host_runtime_int[i].end(),
+                          soa.GetIntData(NAI+i).begin() + old_size);
+            }
+        }
+
+        Redistribute();
+    }
+};
+
+void test_async_io(TestParams& parms)
+{
+    int nlevs = parms.nlevs;
+
+    RealBox real_box;
+    for (int n = 0; n < BL_SPACEDIM; n++) {
+        real_box.setLo(n, 0.0);
+        real_box.setHi(n, 1.0);
+    }
+
+    RealBox fine_box;
+    for (int n = 0; n < BL_SPACEDIM; n++)
+    {
+       fine_box.setLo(n,0.25);
+       fine_box.setHi(n,0.75);
+    }
+
+    IntVect domain_lo(D_DECL(0 , 0, 0));
+    IntVect domain_hi(D_DECL(parms.nx - 1, parms.ny - 1, parms.nz-1));
+    const Box domain(domain_lo, domain_hi);
+
+    // Define the refinement ratio
+    Vector<IntVect> rr(nlevs-1);
+    for (int lev = 1; lev < nlevs; lev++)
+        rr[lev-1] = IntVect(AMREX_D_DECL(2, 2, 2));
+
+    // This sets the boundary conditions to be doubly or triply periodic
+    int is_per[BL_SPACEDIM];
+    for (int i = 0; i < BL_SPACEDIM; i++)
+        is_per[i] = 1;
+
+    // This defines a Geometry object which is useful for writing the plotfiles
+    Vector<Geometry> geom(nlevs);
+    geom[0].define(domain, &real_box, CoordSys::cartesian, is_per);
+    for (int lev = 1; lev < nlevs; lev++) {
+	geom[lev].define(amrex::refine(geom[lev-1].Domain(), rr[lev-1]),
+			 &real_box, CoordSys::cartesian, is_per);
+    }
+
+    Vector<BoxArray> ba(nlevs);
+    ba[0].define(domain);
+
+    if (nlevs > 1) {
+        int n_fine = parms.nx*rr[0][0];
+        IntVect refined_lo(D_DECL(n_fine/4,n_fine/4,n_fine/4));
+        IntVect refined_hi(D_DECL(3*n_fine/4-1,3*n_fine/4-1,3*n_fine/4-1));
+
+        // Build a box for the level 1 domain
+        Box refined_patch(refined_lo, refined_hi);
+        ba[1].define(refined_patch);
+    }
+
+    // break the BoxArrays at both levels into max_grid_size^3 boxes
+    for (int lev = 0; lev < nlevs; lev++) {
+        ba[lev].maxSize(parms.max_grid_size);
+    }
+
+    Vector<DistributionMapping> dmap(nlevs);
+
+    Vector<std::unique_ptr<MultiFab> > partMF(nlevs);
+    Vector<std::unique_ptr<MultiFab> > density(nlevs);
+    Vector<std::unique_ptr<MultiFab> > acceleration(nlevs);
+    for (int lev = 0; lev < nlevs; lev++) {
+        dmap[lev] = DistributionMapping{ba[lev]};
+        density[lev].reset(new MultiFab(ba[lev], dmap[lev], 1, 0));
+        density[lev]->setVal(0.0);
+        acceleration[lev].reset(new MultiFab(ba[lev], dmap[lev], 3, 1));
+        acceleration[lev]->setVal(5.0, 1);
+    }
+
+    MyParticleContainer myPC(geom, dmap, ba, rr);
+    myPC.SetVerbose(false);
+
+    myPC.InitParticles(IntVect(2, 2, 2));
+
+    for (int step = 0; step < 4000; ++step)
+    {
+        myPC.AssignDensity(0, partMF, 0, 1, nlevs-1);
+
+        for (int lev = 0; lev < nlevs; ++lev) {
+            MultiFab::Copy(*density[lev], *partMF[lev], 0, 0, 1, 0);
+        }
+
+        if (step % 1000 == 0) {
+            Vector<std::string> varnames;
+            varnames.push_back("density");
+
+            Vector<std::string> particle_varnames;
+            particle_varnames.push_back("mass");
+
+            Vector<int> level_steps;
+            level_steps.push_back(0);
+            level_steps.push_back(0);
+
+            int output_levs = nlevs;
+
+            Vector<const MultiFab*> outputMF(output_levs);
+            Vector<IntVect> outputRR(output_levs);
+            for (int lev = 0; lev < output_levs; ++lev) {
+                outputMF[lev] = density[lev].get();
+                outputRR[lev] = IntVect(D_DECL(2, 2, 2));
+            }
+
+            std::string fn = amrex::Concatenate("plt", step, 5);
+
+            WriteMultiLevelPlotfile(fn, output_levs, outputMF,
+                                    varnames, geom, 0.0, level_steps, outputRR);
+
+            myPC.WritePlotFile(fn, "particle0");
+        }
+    }
+}
+
+int main(int argc, char* argv[])
+{
+  amrex::Initialize(argc,argv);
+
+  ParmParse pp;
+
+  TestParams parms;
+
+  pp.get("nx", parms.nx);
+  pp.get("ny", parms.ny);
+  pp.get("nz", parms.nz);
+  pp.get("max_grid_size", parms.max_grid_size);
+  pp.get("nlevs", parms.nlevs);
+  pp.get("nppc", parms.nppc);
+  if (parms.nppc < 1 && ParallelDescriptor::IOProcessor())
+    amrex::Abort("Must specify at least one particle per cell");
+
+  parms.verbose = false;
+  pp.query("verbose", parms.verbose);
+
+  if (parms.verbose && ParallelDescriptor::IOProcessor()) {
+    std::cout << std::endl;
+    std::cout << "Number of particles per cell : ";
+    std::cout << parms.nppc  << std::endl;
+    std::cout << "Size of domain               : ";
+    std::cout << "Num levels: ";
+    std::cout << parms.nlevs << std::endl;
+    std::cout << parms.nx << " " << parms.ny << " " << parms.nz << std::endl;
+  }
+
+  test_async_io(parms);
+
+  amrex::Finalize();
+}