diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index a9a475101c..dca96f4155 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -166,7 +166,7 @@ jobs:
               -DCMAKE_BUILD_TYPE=Release          `
               -DAMReX_GPU_BACKEND=CUDA            `
               -DAMReX_CUDA_ARCH="8.0"             `
-              -DAMReX_ENABLE_TESTS=OFF            `
+              -DAMReX_ENABLE_TESTS=ON             `
               -DAMReX_EB=OFF                      `
               -DAMReX_FFT=OFF                     `
               -DAMReX_LINEAR_SOLVERS=OFF          `
diff --git a/Src/AmrCore/AMReX_FillPatchUtil_I.H b/Src/AmrCore/AMReX_FillPatchUtil_I.H
index 0c4f333935..bf6684bda1 100644
--- a/Src/AmrCore/AMReX_FillPatchUtil_I.H
+++ b/Src/AmrCore/AMReX_FillPatchUtil_I.H
@@ -142,22 +142,14 @@ FillPatchSingleLevel (MF& mf, IntVect const& nghost, Real time,
 
                 if (time == t0)
                 {
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                     AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n,
-#endif
                     {
                         dfab(i,j,k,n+destcomp) = sfab0(i,j,k,n+scomp);
                     });
                 }
                 else if (time == t1)
                 {
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                     AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n,
-#endif
                     {
                         dfab(i,j,k,n+destcomp) = sfab1(i,j,k,n+scomp);
                     });
@@ -166,11 +158,7 @@ FillPatchSingleLevel (MF& mf, IntVect const& nghost, Real time,
                 {
                     Real alpha = (t1-time)/(t1-t0);
                     Real beta = (time-t0)/(t1-t0);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                     AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n,
-#endif
                     {
                         dfab(i,j,k,n+destcomp) = alpha*sfab0(i,j,k,n+scomp)
                             +                     beta*sfab1(i,j,k,n+scomp);
@@ -178,11 +166,7 @@ FillPatchSingleLevel (MF& mf, IntVect const& nghost, Real time,
                 }
                 else
                 {
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                     AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n,
-#endif
                     {
                         dfab(i,j,k,n+destcomp) = sfab0(i,j,k,n+scomp);
                     });
diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
index b3071c7e29..8f69b54a6a 100644
--- a/Src/AmrCore/AMReX_TagBox.cpp
+++ b/Src/AmrCore/AMReX_TagBox.cpp
@@ -43,11 +43,7 @@ TagBox::coarsen (const IntVect& ratio, const Box& cbox) noexcept
     Dim3 r{1,1,1};
     AMREX_D_TERM(r.x = ratio[0];, r.y = ratio[1];, r.z = ratio[2]);
 
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-    amrex::ParallelFor(cbox, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-#else
     AMREX_HOST_DEVICE_FOR_3D(cbox, i, j, k,
-#endif
     {
         TagType t = TagBox::CLEAR;
         for (int koff = 0; koff < r.z; ++koff) {
@@ -87,11 +83,7 @@ TagBox::buffer (const IntVect& a_nbuff, const IntVect& a_nwid) noexcept
         Box const& interiorplusbuf = amrex::grow(interior, a_nbuff);
         const auto lo = amrex::lbound(interiorplusbuf);
         const auto hi = amrex::ubound(interiorplusbuf);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-        amrex::ParallelFor(interiorplusbuf, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-#else
         AMREX_HOST_DEVICE_FOR_3D(interiorplusbuf, i, j, k,
-#endif
         {
             if (a(i,j,k) == TagBox::CLEAR) {
                 bool to_buf = false;
diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
index 71576bcb8c..e20f8bb4b7 100644
--- a/Src/Base/AMReX_FBI.H
+++ b/Src/Base/AMReX_FBI.H
@@ -422,11 +422,7 @@ FabArray<FAB>::FB_local_copy_cuda_graph_1 (const FB& TheFB, int scomp, int ncomp
             {
                 const auto offset = tag.offset.dim3();
                 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                amrex::ParallelFor(tag.dbox, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-#else
                 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
-#endif
                 {
                     // Build the Array4's.
                     auto const dst = cmem->getDst<value_type>();
@@ -511,11 +507,7 @@ FabArray<FAB>::FB_local_copy_cuda_graph_n (const FB& TheFB, int scomp, int ncomp
                 const Dim3 offset = tag.offset.dim3();
 
                 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                amrex::ParallelFor(tag.dbox, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-#else
                 AMREX_HOST_DEVICE_FOR_3D(tag.dbox, i, j, k,
-#endif
                 {
                     auto const dst = cmem->getDst<value_type>();
                     auto const src = cmem->getSrc<value_type>();
@@ -601,11 +593,7 @@ FabArray<FAB>::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n
                 {
                     const Box& bx = tag.sbox;
                     CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int ii, int jj, int kk)
-#else
                     AMREX_HOST_DEVICE_FOR_3D (bx, ii, jj, kk,
-#endif
                     {
                         auto const pfab = cmem->getDst<value_type>();
                         auto const sfab = cmem->getSrc<value_type>();
@@ -697,11 +685,7 @@ FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int
             for (auto const & tag : tags)
             {
                 CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                amrex::ParallelFor(tag.dbox, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-#else
                 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
-#endif
                 {
                     auto const pfab = cmem->getSrc<value_type>();
                     auto const dfab = cmem->getDst<value_type>();
diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H
index cdccef8d2a..bfac76c471 100644
--- a/Src/Base/AMReX_FabArrayUtility.H
+++ b/Src/Base/AMReX_FabArrayUtility.H
@@ -1136,11 +1136,7 @@ Subtract (FabArray<FAB>& dst, FabArray<FAB> const& src, int srccomp, int dstcomp
             {
                 auto const srcFab = src.array(mfi);
                 auto       dstFab = dst.array(mfi);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                amrex::ParallelFor(bx, numcomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                 AMREX_HOST_DEVICE_PARALLEL_FOR_4D( bx, numcomp, i, j, k, n,
-#endif
                 {
                     dstFab(i,j,k,n+dstcomp) -= srcFab(i,j,k,n+srccomp);
                 });
@@ -1188,11 +1184,7 @@ Multiply (FabArray<FAB>& dst, FabArray<FAB> const& src, int srccomp, int dstcomp
             {
                 auto const srcFab = src.array(mfi);
                 auto       dstFab = dst.array(mfi);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                amrex::ParallelFor(bx, numcomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                 AMREX_HOST_DEVICE_PARALLEL_FOR_4D( bx, numcomp, i, j, k, n,
-#endif
                 {
                     dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp);
                 });
@@ -1240,11 +1232,7 @@ Divide (FabArray<FAB>& dst, FabArray<FAB> const& src, int srccomp, int dstcomp,
             {
                 auto const srcFab = src.array(mfi);
                 auto       dstFab = dst.array(mfi);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                amrex::ParallelFor(bx, numcomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                 AMREX_HOST_DEVICE_PARALLEL_FOR_4D( bx, numcomp, i, j, k, n,
-#endif
                 {
                     dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp);
                 });
@@ -1289,11 +1277,7 @@ Abs (FabArray<FAB>& fa, int icomp, int numcomp, const IntVect& nghost)
             if (bx.ok())
             {
                 auto const& fab = fa.array(mfi);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                amrex::ParallelFor(bx, numcomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                 AMREX_HOST_DEVICE_PARALLEL_FOR_4D( bx, numcomp, i, j, k, n,
-#endif
                 {
                     fab(i,j,k,n+icomp) = std::abs(fab(i,j,k,n+icomp));
                 });
@@ -1380,11 +1364,7 @@ OverrideSync_nowait (FabArray<FAB> & fa, FabArray<IFAB> const& msk, const Period
             const Box& bx = mfi.tilebox();
             auto fab = fa.array(mfi);
             auto const ifab = msk.array(mfi);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-            amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
             AMREX_HOST_DEVICE_PARALLEL_FOR_4D( bx, ncomp, i, j, k, n,
-#endif
             {
                 if (!ifab(i,j,k)) { fab(i,j,k,n) = 0; }
             });
diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H
index e1eaa49e23..8c14adb24f 100644
--- a/Src/Base/AMReX_MultiFabUtil.H
+++ b/Src/Base/AMReX_MultiFabUtil.H
@@ -251,11 +251,7 @@ namespace amrex
             const Long n = mfi.fabbox().numPts() * mf_in.nComp();
             auto      * pdst = mf_out[mfi].dataPtr();
             auto const* psrc = mf_in [mfi].dataPtr();
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-            amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (Long i)
-#else
             AMREX_HOST_DEVICE_PARALLEL_FOR_1D ( n, i,
-#endif
             {
                 pdst[i] = static_cast<typename U::value_type>(psrc[i]); // NOLINT(bugprone-signed-char-misuse)
             });
@@ -580,20 +576,12 @@ void average_down (const FabArray<FAB>& S_fine, FabArray<FAB>& S_crse,
                 Array4<value_type const> const& finearr = S_fine.const_array(mfi);
 
                 if (is_cell_centered) {
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                     AMREX_HOST_DEVICE_PARALLEL_FOR_4D(bx, ncomp, i, j, k, n,
-#endif
                                                       {
                                                           amrex_avgdown(i,j,k,n,crsearr,finearr,scomp,scomp,ratio);
                                                       });
                 } else {
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                     AMREX_HOST_DEVICE_PARALLEL_FOR_4D(bx, ncomp, i, j, k, n,
-#endif
                                                       {
                                                           amrex_avgdown_nodes(i,j,k,n,crsearr,finearr,scomp,scomp,ratio);
                                                       });
@@ -643,20 +631,12 @@ void average_down (const FabArray<FAB>& S_fine, FabArray<FAB>& S_crse,
                 //        not part of the actual crse multifab which came in.
 
                 if (is_cell_centered) {
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                     AMREX_HOST_DEVICE_PARALLEL_FOR_4D(bx, ncomp, i, j, k, n,
-#endif
                                                       {
                                                           amrex_avgdown(i,j,k,n,crsearr,finearr,0,scomp,ratio);
                                                       });
                 } else {
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                     AMREX_HOST_DEVICE_PARALLEL_FOR_4D(bx, ncomp, i, j, k, n,
-#endif
                                                       {
                                                           amrex_avgdown_nodes(i,j,k,n,crsearr,finearr,0,scomp,ratio);
                                                       });
@@ -851,22 +831,14 @@ void average_face_to_cellcenter (CMF& cc, int dcomp,
                          auto const& fzarr = fc[2]->const_array(mfi););
 
 #if (AMREX_SPACEDIM == 1)
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-#else
             AMREX_HOST_DEVICE_PARALLEL_FOR_3D( bx, i, j, k,
-#endif
             {
                 GeometryData gd;
                 gd.coord = 0;
                 amrex_avg_fc_to_cc(i,j,k, ccarr, fxarr, dcomp, gd);
             });
-#else
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
 #else
             AMREX_HOST_DEVICE_PARALLEL_FOR_3D( bx, i, j, k,
-#endif
             {
                 amrex_avg_fc_to_cc(i,j,k, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), dcomp);
             });
@@ -957,11 +929,7 @@ void average_down_faces (const FabArray<FAB>& fine, FabArray<FAB>& crse,
                 const Box& bx = mfi.growntilebox(ngcrse);
                 auto const& crsearr = crse.array(mfi);
                 auto const& finearr = fine.const_array(mfi);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-                amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
                 AMREX_HOST_DEVICE_PARALLEL_FOR_4D(bx, ncomp, i, j, k, n,
-#endif
                 {
                     amrex_avgdown_faces(i,j,k,n, crsearr, finearr, 0, 0, ratio, dir);
                 });
diff --git a/Src/Base/AMReX_NonLocalBCImpl.H b/Src/Base/AMReX_NonLocalBCImpl.H
index 9adf95c0c8..d2bf545eab 100644
--- a/Src/Base/AMReX_NonLocalBCImpl.H
+++ b/Src/Base/AMReX_NonLocalBCImpl.H
@@ -550,11 +550,7 @@ Rotate90 (FabArray<FAB>& mf, int scomp, int ncomp, IntVect const& nghost, Box co
         Box const& bx = corner & mfi.fabbox();
         if (bx.ok()) {
             auto const& fab = mf.array(mfi);
-#if defined(AMREX_USE_GPU) && defined(_WIN32)
-            amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
-#else
             AMREX_HOST_DEVICE_PARALLEL_FOR_4D(bx,ncomp,i,j,k,n,
-#endif
             {
                 fab(i,j,k,n) = fab(-i-1,-j-1,k,n);
             });