From 65aeea5f8a64dadf907e00bd5a6e5a7e2494e53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Tue, 8 Sep 2020 09:33:08 +0200 Subject: [PATCH 01/13] fix usage of `::abs()` Use cupla math implementation of `abs()` instead of `abs` from the global scope. This change solves compile issues with HIP, found in my HIP prototype branch. --- include/picongpu/plugins/output/images/Visualisation.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/picongpu/plugins/output/images/Visualisation.hpp b/include/picongpu/plugins/output/images/Visualisation.hpp index 194233170c..969fd80832 100644 --- a/include/picongpu/plugins/output/images/Visualisation.hpp +++ b/include/picongpu/plugins/output/images/Visualisation.hpp @@ -156,7 +156,7 @@ struct typicalFields < 5 > const float_X tyEField = fields::laserProfiles::Selected::Unitless::W0 * BASE_DENSITY / 3.0f / EPS0; const float_X tyBField = tyEField * MUE0_EPS0; const float_X tyCurrent = particles::TYPICAL_PARTICLES_PER_CELL * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE - * abs(baseCharge) / DELTA_T; + * math::abs(baseCharge) / DELTA_T; return float3_X(tyBField, tyEField, tyCurrent); #endif From 1d3b24d05a37786502ec1d4dbb868769c1830657 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Mon, 14 Sep 2020 10:35:22 +0200 Subject: [PATCH 02/13] Clarify output concerning cuda_memtest not being available solves #3327 --- etc/picongpu/aris-grnet/gpu.tpl | 2 +- etc/picongpu/bash/mpiexec.tpl | 2 +- etc/picongpu/bash/mpirun.tpl | 2 +- etc/picongpu/davide-cineca/gpu.tpl | 2 +- etc/picongpu/davinci-rice/picongpu.tpl | 2 +- etc/picongpu/hemera-hzdr/fwkt_v100.tpl | 2 +- etc/picongpu/hemera-hzdr/gpu.tpl | 2 +- etc/picongpu/hemera-hzdr/k20.tpl | 2 +- etc/picongpu/hemera-hzdr/k20_restart.tpl | 2 +- etc/picongpu/hemera-hzdr/k80.tpl | 2 +- etc/picongpu/hemera-hzdr/k80_restart.tpl | 2 +- etc/picongpu/jureca-jsc/gpus.tpl | 2 +- etc/picongpu/juwels-jsc/gpus.tpl | 2 +- etc/picongpu/lawrencium-lbnl/fermi.tpl | 2 +- etc/picongpu/lawrencium-lbnl/k20.tpl | 2 +- etc/picongpu/pizdaint-cscs/large.tpl | 2 +- etc/picongpu/pizdaint-cscs/normal.tpl | 2 +- etc/picongpu/taurus-tud/V100.tpl | 2 +- etc/picongpu/taurus-tud/V100_restart.tpl | 2 +- etc/picongpu/taurus-tud/k20x.tpl | 2 +- etc/picongpu/taurus-tud/k80.tpl | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) diff --git a/etc/picongpu/aris-grnet/gpu.tpl b/etc/picongpu/aris-grnet/gpu.tpl index 440a2080b8..05ec169f0b 100644 --- a/etc/picongpu/aris-grnet/gpu.tpl +++ b/etc/picongpu/aris-grnet/gpu.tpl @@ -102,7 +102,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq # Run CUDA memtest to check GPU's health srun -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? 
-eq 0 ] ; then
diff --git a/etc/picongpu/bash/mpiexec.tpl b/etc/picongpu/bash/mpiexec.tpl
index 04f509cb57..c4f58c67d8 100644
--- a/etc/picongpu/bash/mpiexec.tpl
+++ b/etc/picongpu/bash/mpiexec.tpl
@@ -53,7 +53,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpiexec -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/bash/mpirun.tpl b/etc/picongpu/bash/mpirun.tpl
index fb6e760cd1..f9c07cdc62 100644
--- a/etc/picongpu/bash/mpirun.tpl
+++ b/etc/picongpu/bash/mpirun.tpl
@@ -53,7 +53,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpirun -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/davide-cineca/gpu.tpl b/etc/picongpu/davide-cineca/gpu.tpl
index f4c0f50ee6..0ccc52f254 100644
--- a/etc/picongpu/davide-cineca/gpu.tpl
+++ b/etc/picongpu/davide-cineca/gpu.tpl
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   srun --cpu-bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/davinci-rice/picongpu.tpl b/etc/picongpu/davinci-rice/picongpu.tpl
index b4e316b2bd..765687fdab 100644
--- a/etc/picongpu/davinci-rice/picongpu.tpl
+++ b/etc/picongpu/davinci-rice/picongpu.tpl
@@ -81,7 +81,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpirun -n TBG_tasks --display-map -am tbg/openib.conf --mca mpi_leave_pinned 0 !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/hemera-hzdr/fwkt_v100.tpl b/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
index 5bc8341d9b..09058913a0 100644
--- a/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
+++ b/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $?
-eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/gpu.tpl b/etc/picongpu/hemera-hzdr/gpu.tpl index 90c9b2d12c..a06c278306 100644 --- a/etc/picongpu/hemera-hzdr/gpu.tpl +++ b/etc/picongpu/hemera-hzdr/gpu.tpl @@ -101,7 +101,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq # Run CUDA memtest to check GPU's health mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/k20.tpl b/etc/picongpu/hemera-hzdr/k20.tpl index b8992555f0..f58454e6f9 100644 --- a/etc/picongpu/hemera-hzdr/k20.tpl +++ b/etc/picongpu/hemera-hzdr/k20.tpl @@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq # Run CUDA memtest to check GPU's health mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/k20_restart.tpl b/etc/picongpu/hemera-hzdr/k20_restart.tpl index 52b9701b07..d531615ccb 100644 --- a/etc/picongpu/hemera-hzdr/k20_restart.tpl +++ b/etc/picongpu/hemera-hzdr/k20_restart.tpl @@ -167,7 +167,7 @@ export OMPI_MCA_io=^ompio if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq !TBG_gpusPerNode ] ; then mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/k80.tpl b/etc/picongpu/hemera-hzdr/k80.tpl index 3cfc81ea4d..9c52c96b38 100644 --- a/etc/picongpu/hemera-hzdr/k80.tpl +++ b/etc/picongpu/hemera-hzdr/k80.tpl @@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq # Run CUDA memtest to check GPU's health mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? 
-eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/k80_restart.tpl b/etc/picongpu/hemera-hzdr/k80_restart.tpl index d65f9e9730..4a74804505 100644 --- a/etc/picongpu/hemera-hzdr/k80_restart.tpl +++ b/etc/picongpu/hemera-hzdr/k80_restart.tpl @@ -167,7 +167,7 @@ export OMPI_MCA_io=^ompio if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq !TBG_gpusPerNode ] ; then mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/jureca-jsc/gpus.tpl b/etc/picongpu/jureca-jsc/gpus.tpl index a6bda9a5e2..182bf90225 100644 --- a/etc/picongpu/jureca-jsc/gpus.tpl +++ b/etc/picongpu/jureca-jsc/gpus.tpl @@ -93,7 +93,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedDevicesPerNode # Run CUDA memtest to check GPU's health srun --cpu_bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/juwels-jsc/gpus.tpl b/etc/picongpu/juwels-jsc/gpus.tpl index 6d8c717931..c90d613289 100644 --- a/etc/picongpu/juwels-jsc/gpus.tpl +++ b/etc/picongpu/juwels-jsc/gpus.tpl @@ -93,7 +93,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedDevicesPerNode # Run CUDA memtest to check GPU's health srun --cpu_bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/lawrencium-lbnl/fermi.tpl b/etc/picongpu/lawrencium-lbnl/fermi.tpl index 796562c1b2..16bb7e525c 100644 --- a/etc/picongpu/lawrencium-lbnl/fermi.tpl +++ b/etc/picongpu/lawrencium-lbnl/fermi.tpl @@ -106,7 +106,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health mpirun !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/lawrencium-lbnl/k20.tpl b/etc/picongpu/lawrencium-lbnl/k20.tpl index 76518fbd1a..a7be12dd2a 100644 --- a/etc/picongpu/lawrencium-lbnl/k20.tpl +++ b/etc/picongpu/lawrencium-lbnl/k20.tpl @@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health mpirun !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? 
-eq 0 ] ; then diff --git a/etc/picongpu/pizdaint-cscs/large.tpl b/etc/picongpu/pizdaint-cscs/large.tpl index 41c87ffa7b..6e6501384d 100644 --- a/etc/picongpu/pizdaint-cscs/large.tpl +++ b/etc/picongpu/pizdaint-cscs/large.tpl @@ -82,7 +82,7 @@ ln -s ../stdout output if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then srun -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/pizdaint-cscs/normal.tpl b/etc/picongpu/pizdaint-cscs/normal.tpl index 5de12231bf..abcbf7f607 100644 --- a/etc/picongpu/pizdaint-cscs/normal.tpl +++ b/etc/picongpu/pizdaint-cscs/normal.tpl @@ -87,7 +87,7 @@ export PMI_NO_PREINITIALIZE=1 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then srun -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/taurus-tud/V100.tpl b/etc/picongpu/taurus-tud/V100.tpl index 8f6dbbf922..f678e75c8d 100644 --- a/etc/picongpu/taurus-tud/V100.tpl +++ b/etc/picongpu/taurus-tud/V100.tpl @@ -107,7 +107,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/taurus-tud/V100_restart.tpl b/etc/picongpu/taurus-tud/V100_restart.tpl index 8e34ff0d1b..d1f07c337f 100644 --- a/etc/picongpu/taurus-tud/V100_restart.tpl +++ b/etc/picongpu/taurus-tud/V100_restart.tpl @@ -175,7 +175,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health mpiexec -hostfile ../machinefile.txt !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/taurus-tud/k20x.tpl b/etc/picongpu/taurus-tud/k20x.tpl index d7be22efe9..17b34581f1 100644 --- a/etc/picongpu/taurus-tud/k20x.tpl +++ b/etc/picongpu/taurus-tud/k20x.tpl @@ -97,7 +97,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? 
-eq 0 ] ; then diff --git a/etc/picongpu/taurus-tud/k80.tpl b/etc/picongpu/taurus-tud/k80.tpl index 279bdc9e43..80480f5a6c 100644 --- a/etc/picongpu/taurus-tud/k80.tpl +++ b/etc/picongpu/taurus-tud/k80.tpl @@ -97,7 +97,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then From 700c248b4528c469236fd94e96525b5e60bf76f7 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Wed, 16 Sep 2020 15:14:39 +0200 Subject: [PATCH 03/13] Remove leftover mentions of pmacc::nvidia::rng with no actual use Fix outdated comments --- include/picongpu/particles/Particles.kernel | 4 ---- include/picongpu/particles/filter/generic/FreeRng.hpp | 2 -- include/picongpu/particles/functor/misc/Rng.hpp | 4 ---- include/picongpu/particles/manipulators/generic/FreeRng.def | 2 +- include/picongpu/particles/manipulators/generic/FreeRng.hpp | 2 -- .../picongpu/particles/manipulators/unary/RandomPosition.def | 2 +- include/picongpu/particles/manipulators/unary/Temperature.hpp | 2 +- include/picongpu/particles/startPosition/RandomImpl.hpp | 4 +--- include/picongpu/particles/startPosition/generic/FreeRng.hpp | 2 -- .../examples/FoilLCT/include/picongpu/param/particle.param | 1 - 10 files changed, 4 insertions(+), 21 deletions(-) diff --git a/include/picongpu/particles/Particles.kernel b/include/picongpu/particles/Particles.kernel index 93b0dcd09e..538189946f 100644 --- a/include/picongpu/particles/Particles.kernel +++ b/include/picongpu/particles/Particles.kernel @@ -35,10 +35,6 @@ #include #include -#include -#include -#include - #include #include #include diff --git a/include/picongpu/particles/filter/generic/FreeRng.hpp b/include/picongpu/particles/filter/generic/FreeRng.hpp index 2667644f29..dc701ebd5b 100644 --- a/include/picongpu/particles/filter/generic/FreeRng.hpp +++ b/include/picongpu/particles/filter/generic/FreeRng.hpp @@ -78,8 +78,6 @@ namespace acc T_Particle const & particle ) { - namespace nvrng = nvidia::rng; - bool const isValid = particle.isHandleValid( ); return isValid && Functor::operator()( diff --git a/include/picongpu/particles/functor/misc/Rng.hpp b/include/picongpu/particles/functor/misc/Rng.hpp index c141fb1ddc..0b40b1cf5b 100644 --- a/include/picongpu/particles/functor/misc/Rng.hpp +++ b/include/picongpu/particles/functor/misc/Rng.hpp @@ -22,8 +22,6 @@ #include "picongpu/simulation_defines.hpp" #include "picongpu/particles/functor/misc/RngWrapper.hpp" -#include -#include #include #include #include @@ -93,8 +91,6 @@ namespace misc T_WorkerCfg const & workerCfg ) const { - namespace nvrng = nvidia::rng; - RngHandle tmp( rngHandle ); tmp.init( localSupercellOffset * SuperCellSize::toRT() + diff --git a/include/picongpu/particles/manipulators/generic/FreeRng.def b/include/picongpu/particles/manipulators/generic/FreeRng.def index e11c8c2b4e..b82907f831 100644 --- a/include/picongpu/particles/manipulators/generic/FreeRng.def +++ b/include/picongpu/particles/manipulators/generic/FreeRng.def @@ -42,7 +42,7 @@ namespace generic * * example for `particle.param`: add * @code{.cpp} - * #include + * #include * * struct FunctorRandomX * { diff --git a/include/picongpu/particles/manipulators/generic/FreeRng.hpp b/include/picongpu/particles/manipulators/generic/FreeRng.hpp index 
adf5954b43..6bca53a296 100644 --- a/include/picongpu/particles/manipulators/generic/FreeRng.hpp +++ b/include/picongpu/particles/manipulators/generic/FreeRng.hpp @@ -81,8 +81,6 @@ namespace acc T_Args && ... args ) { - namespace nvrng = nvidia::rng; - Functor::operator()( m_rng, particle, diff --git a/include/picongpu/particles/manipulators/unary/RandomPosition.def b/include/picongpu/particles/manipulators/unary/RandomPosition.def index 91868f5047..0186f419ba 100644 --- a/include/picongpu/particles/manipulators/unary/RandomPosition.def +++ b/include/picongpu/particles/manipulators/unary/RandomPosition.def @@ -46,7 +46,7 @@ namespace acc { /** set in-cell position * - * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator + * @tparam T_Rng functor::misc::RngWrapper, type of the random number generator * @tparam T_Particle pmacc::Particle, particle type * @tparam T_Args pmacc::Particle, arbitrary number of particles types * diff --git a/include/picongpu/particles/manipulators/unary/Temperature.hpp b/include/picongpu/particles/manipulators/unary/Temperature.hpp index 6d2c0a8fe4..4eff7cb3f1 100644 --- a/include/picongpu/particles/manipulators/unary/Temperature.hpp +++ b/include/picongpu/particles/manipulators/unary/Temperature.hpp @@ -48,7 +48,7 @@ namespace acc { /** manipulate the speed of the particle * - * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator + * @tparam T_Rng functor::misc::RngWrapper, type of the random number generator * @tparam T_Particle pmacc::Particle, particle type * @tparam T_Args pmacc::Particle, arbitrary number of particles types * diff --git a/include/picongpu/particles/startPosition/RandomImpl.hpp b/include/picongpu/particles/startPosition/RandomImpl.hpp index 4727d54aee..75d409b907 100644 --- a/include/picongpu/particles/startPosition/RandomImpl.hpp +++ b/include/picongpu/particles/startPosition/RandomImpl.hpp @@ -24,8 +24,6 @@ #include "picongpu/particles/startPosition/generic/FreeRng.def" #include "picongpu/particles/startPosition/detail/WeightMacroParticles.hpp" -#include - #include @@ -43,7 +41,7 @@ namespace acc { /** set in-cell position and weighting * - * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator + * @tparam T_Rng functor::misc::RngWrapper, type of the random number generator * @tparam T_Particle pmacc::Particle, particle type * @tparam T_Args pmacc::Particle, arbitrary number of particles types * diff --git a/include/picongpu/particles/startPosition/generic/FreeRng.hpp b/include/picongpu/particles/startPosition/generic/FreeRng.hpp index c7a37b952d..cdc358e55c 100644 --- a/include/picongpu/particles/startPosition/generic/FreeRng.hpp +++ b/include/picongpu/particles/startPosition/generic/FreeRng.hpp @@ -80,8 +80,6 @@ namespace acc T_Args && ... 
args ) { - namespace nvrng = nvidia::rng; - Functor::operator()( m_rng, particle, diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param index 8ac1a6bc87..b57d4aad56 100644 --- a/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param +++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param @@ -34,7 +34,6 @@ #include #include -#include namespace picongpu From 5c59aba005621c7f2e00dc0f34583678c292bb18 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Wed, 16 Sep 2020 15:15:11 +0200 Subject: [PATCH 04/13] Change the game of life example to use pmacc::random tools instead of pmacc::nvidia::rng --- .../gameOfLife2D/include/Evolution.hpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp index 5e3affa0ea..2fd631723e 100644 --- a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp +++ b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp @@ -28,9 +28,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include #include #include @@ -199,11 +199,15 @@ namespace kernel blockCell + DataSpaceOperations< DIM2 >::template map< SuperCellSize >( workerIdx ) ); - // get uniform random number from seed - auto rng = nvidia::rng::create( - nvidia::rng::methods::Xor< T_Acc >( acc, seed, globalUniqueId ), - nvidia::rng::distributions::Uniform_float::get( acc ) - ); + // create a random number state and generator + using RngMethod = random::methods::XorMin< T_Acc >; + using State = typename RngMethod::StateType; + State state; + RngMethod method; + method.init( acc, state, seed, globalUniqueId ); + using Distribution = random::distributions::Uniform< float, RngMethod >; + using Random = random::Random< Distribution, RngMethod, State* >; + Random rng( &state ); ForEachIdx< IdxConfig< @@ -219,7 +223,7 @@ namespace kernel // cell index within the superCell DataSpace< DIM2 > const cellIdx = DataSpaceOperations< DIM2 >::template map< SuperCellSize >( linearIdx ); // write 1(white) if uniform random number 0( rng() <= threshold ); + buffWrite( blockCell + cellIdx ) = static_cast< bool >( rng( acc ) <= threshold ); } ); } From f2c32136db7028388f3a2aba34bd30b5dbec6394 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Wed, 16 Sep 2020 14:38:09 +0200 Subject: [PATCH 05/13] Remove pmacc/nvidia/rng/* as all its contents are never used These were outdated tools, with pmacc::random providing the modern counterparts --- include/pmacc/nvidia/rng/RNG.hpp | 94 ------------------- .../nvidia/rng/distributions/Normal_float.hpp | 80 ---------------- .../rng/distributions/Uniform_float.hpp | 93 ------------------ .../rng/distributions/Uniform_int32.hpp | 83 ---------------- include/pmacc/nvidia/rng/methods/Xor.hpp | 75 --------------- 5 files changed, 425 deletions(-) delete mode 100644 include/pmacc/nvidia/rng/RNG.hpp delete mode 100644 include/pmacc/nvidia/rng/distributions/Normal_float.hpp delete mode 100644 include/pmacc/nvidia/rng/distributions/Uniform_float.hpp delete mode 100644 include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp delete mode 100644 include/pmacc/nvidia/rng/methods/Xor.hpp diff --git a/include/pmacc/nvidia/rng/RNG.hpp b/include/pmacc/nvidia/rng/RNG.hpp deleted file mode 100644 index 2a47f857c9..0000000000 --- a/include/pmacc/nvidia/rng/RNG.hpp +++ /dev/null @@ -1,94 +0,0 @@ 
-/* Copyright 2013-2020 Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ - - /* create a random number generator on gpu - * \tparam RngMethod method to generate random number - * \tparam Distribution functor for distribution - */ - template - class RNG : public RNGMethod - { - public: - - typedef RNGMethod MethodType; - typedef Distribution DistributionType; - typedef RNG This; - - HDINLINE RNG() - { - } - - /* - * \param rngMethod instance of generator - * \param distribution instance of distribution functor - */ - DINLINE RNG(const RNGMethod& rng_method, const Distribution& rng_operation) : - RNGMethod(rng_method), op(rng_operation) - { - } - - HDINLINE RNG(const This& other) : - RNGMethod(static_cast(other)), op(other.op) - { - } - - /* default method to generate a random number - * @return random number - */ - DINLINE typename Distribution::Type operator()() - { - return this->op(this->getState()); - } - - private: - PMACC_ALIGN(op, Distribution); - }; - - /* create a random number generator on gpu - * \tparam RngMethod method to generate random number - * \tparam Distribution functor for distribution - * - * \param rngMethod instance of generator - * \param distribution instance of distribution functor - * \return class which can used to generate random numbers - */ - template - DINLINE typename pmacc::nvidia::rng::RNG create(const RngMethod & rngMethod, - const Distribution & distribution) - { - return pmacc::nvidia::rng::RNG(rngMethod, distribution); - } - -} // namespace rng -} // namespace nvidia -} // namespace pmacc diff --git a/include/pmacc/nvidia/rng/distributions/Normal_float.hpp b/include/pmacc/nvidia/rng/distributions/Normal_float.hpp deleted file mode 100644 index 2f09df70a7..0000000000 --- a/include/pmacc/nvidia/rng/distributions/Normal_float.hpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2013-2020 Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . 
- */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ -namespace distributions -{ -namespace detail -{ - /*Return normally distributed floats with mean 0.0f and standard deviation 1.0f - */ - template< typename T_Acc> - class Normal_float - { - public: - typedef float Type; - private: - using Dist = - decltype( - ::alpaka::rand::distribution::createNormalReal( - alpaka::core::declval())); - PMACC_ALIGN(dist, Dist); - public: - HDINLINE Normal_float() - { - } - - HDINLINE Normal_float(const T_Acc& acc) : dist(::alpaka::rand::distribution::createNormalReal(acc)) - { - } - - template - DINLINE Type operator()(RNGState& state) - { - return dist(state); - } - - }; -} // namespace detail - - struct Normal_float - { - template< typename T_Acc> - static HDINLINE detail::Normal_float< T_Acc > - get( T_Acc const & acc) - { - return detail::Normal_float< T_Acc >( acc ); - } - }; -} // namespace distributions -} // namespace rng -} // namespace nvidia -} // namespace pmacc diff --git a/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp b/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp deleted file mode 100644 index b757c9f04a..0000000000 --- a/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ -namespace distributions -{ -namespace detail -{ - /*create a random float number from [0.0,1.0) - */ - template< typename T_Acc> - class Uniform_float - { - public: - typedef float Type; - private: - using Dist = - decltype( - ::alpaka::rand::distribution::createUniformReal( - alpaka::core::declval())); - PMACC_ALIGN(dist, Dist); - public: - - HDINLINE Uniform_float() - { - } - - HDINLINE Uniform_float(const T_Acc& acc) : dist(::alpaka::rand::distribution::createUniformReal(acc)) - { - } - - template - DINLINE Type operator()(RNGState& state) - { - // (0.f, 1.0f] - const Type raw = dist(state); - - /// \warn hack, are is that really ok? I say, yes, since - /// it shifts just exactly one number. 
Axel - /// - /// Note: (1.0f - raw) does not work, since - /// nvidia seems to return denormalized - /// floats around 0.f (thats not as they - /// state it out in their documentation) - // [0.f, 1.0f) - const Type r = raw * static_cast( raw != Type(1.0) ); - return r; - } - - }; -} // namespace detail - - struct Uniform_float - { - template< typename T_Acc> - static HDINLINE detail::Uniform_float< T_Acc > - get( T_Acc const & acc) - { - return detail::Uniform_float< T_Acc >( acc ); - } - }; -} // namespace distributions -} // namespace rng -} // namespace nvidia -} // namespace pmacc diff --git a/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp b/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp deleted file mode 100644 index e0569ecb26..0000000000 --- a/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ -namespace distributions -{ -namespace detail -{ - /*create a 32Bit random int number - * Range: [INT_MIN,INT_MAX] - */ - template< typename T_Acc> - class Uniform_int32 - { - public: - typedef int32_t Type; - - private: - typedef uint32_t RngType; - using Dist = - decltype( - ::alpaka::rand::distribution::createUniformUint( - alpaka::core::declval())); - PMACC_ALIGN(dist, Dist); - public: - HDINLINE Uniform_int() - { - } - - HDINLINE Uniform_int(const T_Acc& acc) : dist(::alpaka::rand::distribution::createUniformUint(acc)) - { - } - - template - DINLINE Type operator()(RNGState& state) - { - /*curand create a random 32Bit int value*/ - return static_cast(dist(state)); - } - }; -} // namespace detail - - struct Normal_float - { - template< typename T_Acc> - static HDINLINE detail::Uniform_int32< T_Acc > - get( T_Acc const & acc) - { - return detail::Uniform_int32< T_Acc >( acc ); - } - }; -} // namespace distributions -} // namespace rng -} // namespace nvidia -} // namespace pmacc diff --git a/include/pmacc/nvidia/rng/methods/Xor.hpp b/include/pmacc/nvidia/rng/methods/Xor.hpp deleted file mode 100644 index f48e6c3714..0000000000 --- a/include/pmacc/nvidia/rng/methods/Xor.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright 2013-2020 Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ -namespace methods -{ - - template< typename T_Acc > - class Xor - { - private: - using Gen = - decltype( - ::alpaka::rand::generator::createDefault( - alpaka::core::declval(), - alpaka::core::declval(), - alpaka::core::declval())); - PMACC_ALIGN(gen, Gen); - public: - typedef Gen StateType; - typedef T_Acc Acc; - - HDINLINE Xor() : gen (0) - { - } - - DINLINE Xor(const T_Acc& acc, uint32_t seed, uint32_t subsequence = 0) - { - gen = ::alpaka::rand::generator::createDefault(acc, seed, subsequence); - } - - HDINLINE Xor(const Xor& other): gen(other.gen) - { - - } - - protected: - - DINLINE StateType& getState() - { - return gen; - } - }; -} // namespace methods -} // namespace rng -} // namespace nvidia -} // namespace pmacc From 4e57dcac8e863daf9803fda24b8c2b738626a990 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Wed, 16 Sep 2020 16:06:58 +0200 Subject: [PATCH 06/13] Set default values for some PMacc game of life example arguments --- share/pmacc/examples/gameOfLife2D/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/share/pmacc/examples/gameOfLife2D/main.cpp b/share/pmacc/examples/gameOfLife2D/main.cpp index d33dc5abaa..fb9a4e4f10 100644 --- a/share/pmacc/examples/gameOfLife2D/main.cpp +++ b/share/pmacc/examples/gameOfLife2D/main.cpp @@ -41,7 +41,7 @@ int main( int argc, char **argv ) { typedef ::gol::Space Space; - std::vector devices; /* will be set by boost program argument option "-d 3 3 3" */ + std::vector devices; /* will be set by boost program argument option "-d 3 3" */ std::vector gridSize; /* same but with -g */ std::vector periodic; uint32_t steps; @@ -50,8 +50,8 @@ int main( int argc, char **argv ) po::options_description desc( "Allowed options" ); desc.add_options( ) ( "help,h", "produce help message" ) - ( "steps,s", po::value ( &steps ), "simulation steps" ) - ( "rule,r", po::value ( &rule ), "simulation rule etc. 23/3" ) + ( "steps,s", po::value ( &steps )->default_value( 100 ), "simulation steps" ) + ( "rule,r", po::value ( &rule )->default_value( "23/3" ), "simulation rule" ) ( "devices,d", po::value > ( &devices )->multitoken( ), "number of devices in each dimension (only 1D or 2D). 
If you use more than " "one device in total, you will need to run mpirun with \"mpirun -n " From 8ae41f0c2ecd0d4858d14ed6fc5db60220024549 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Mon, 14 Sep 2020 17:50:57 +0200 Subject: [PATCH 07/13] Add generic utilities to get absorber thickness These are to be used in the upcoming incident field implementation --- .../fields/MaxwellSolver/YeePML/YeePML.hpp | 3 +- include/picongpu/fields/absorber/Absorber.hpp | 119 ++++++++++++++++++ 2 files changed, 121 insertions(+), 1 deletion(-) diff --git a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp index 5cc787c575..8698a2279c 100644 --- a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp +++ b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp @@ -223,7 +223,8 @@ namespace maxwellSolver Thickness globalThickness; for( uint32_t axis = 0u; axis < simDim; axis++ ) for( auto direction = 0; direction < 2; direction++ ) - globalThickness( axis, direction ) = absorber::numCells[ axis ][ direction ]; + globalThickness( axis, direction ) = + absorber::getGlobalThickness()( axis, direction ); return globalThickness; } diff --git a/include/picongpu/fields/absorber/Absorber.hpp b/include/picongpu/fields/absorber/Absorber.hpp index 5357459117..75f1b60e42 100644 --- a/include/picongpu/fields/absorber/Absorber.hpp +++ b/include/picongpu/fields/absorber/Absorber.hpp @@ -166,6 +166,10 @@ namespace detail using Absorber = detail::Absorber< Solver >; /** Number of absorber cells along each boundary + * + * Stores the global absorber thickness in case the absorbing boundary + * conditions are used along each boundary. Note that in case of periodic + * boundaries the corresponding values will be ignored. * * Is uniform for both PML and exponential damping absorbers. * First index: 0 = x, 1 = y, 2 = z. @@ -178,6 +182,121 @@ namespace detail { Absorber::zNegativeNumCells, Absorber::zPositiveNumCells } }; + //! Thickness of the absorbing layer + class Thickness + { + public: + + //! Create a zero thickness + Thickness() + { + for( uint32_t axis = 0u; axis < 3u; axis++ ) + for( uint32_t direction = 0u; direction < 2u; direction++ ) + (*this)( axis, direction ) = 0u; + } + + /** Get thickness for the given boundary + * + * @param axis axis, 0 = x, 1 = y, 2 = z + * @param direction direction, 0 = negative (min coordinate), + * 1 = positive (max coordinate) + */ + uint32_t operator()( + uint32_t const axis, + uint32_t const direction + ) const + { + return numCells[ axis ][ direction ]; + } + + /** Get reference to thickness for the given boundary + * + * @param axis axis, 0 = x, 1 = y, 2 = z + * @param direction direction, 0 = negative (min coordinate), + * 1 = positive (max coordinate) + */ + uint32_t & operator()( + uint32_t const axis, + uint32_t const direction + ) + { + return numCells[ axis ][ direction ]; + } + + private: + + /** Number of absorber cells along each boundary + * + * First index: 0 = x, 1 = y, 2 = z. + * Second index: 0 = negative (min coordinate), 1 = positive (max coordinate). + */ + uint32_t numCells[ 3 ][ 2 ]; + + }; + + /** Get absorber thickness in number of cells for the global domain + * + * This function takes into account which boundaries are periodic and + * absorbing. 
+ */ + inline Thickness getGlobalThickness() + { + Thickness thickness; + for( uint32_t axis = 0u; axis < 3u; axis++ ) + for( uint32_t direction = 0u; direction < 2u; direction++ ) + thickness( axis, direction ) = numCells[ axis ][ direction ]; + const DataSpace< DIM3 > isPeriodicBoundary = + Environment::get().EnvironmentController().getCommunicator().getPeriodic(); + for( uint32_t axis = 0u; axis < 3u; axis++ ) + if( isPeriodicBoundary[ axis ] ) + { + thickness( axis, 0 ) = 0u; + thickness( axis, 1 ) = 0u; + } + return thickness; + } + + /** Get absorber thickness in number of cells for the current local domain + * + * This function takes into account the current domain decomposition and + * which boundaries are periodic and absorbing. + * + * Note that unlike getGlobalThickness() result which does not change + * throughout the simulation, the local thickness can change. Thus, + * the result of this function should not be reused on another time step, + * but rather the function called again. + */ + inline Thickness getLocalThickness() + { + Thickness thickness = getGlobalThickness(); + auto const numExchanges = NumberOfExchanges< simDim >::value; + auto const communicationMask = Environment< simDim >::get( ).GridController( ).getCommunicationMask( ); + for( uint32_t exchange = 1u; exchange < numExchanges; exchange++ ) + { + /* Here we are only interested in the positive and negative + * directions for x, y, z axes and not the "diagonal" ones. + * So skip other directions except left, right, top, bottom, + * back, front + */ + if( FRONT % exchange != 0 ) + continue; + + // Transform exchange into a pair of axis and direction + uint32_t axis = 0; + if( exchange >= BOTTOM && exchange <= TOP ) + axis = 1; + if( exchange >= BACK ) + axis = 2; + uint32_t direction = exchange % 2; + + // No absorber at the borders between two local domains + bool hasNeighbour = communicationMask.isSet( exchange ); + if( hasNeighbour ) + thickness( axis, direction ) = 0u; + } + return thickness; + } + namespace detail { From 8d0af9b9420f47175dfd98b2ff74b84925411095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Tue, 8 Sep 2020 09:21:48 +0200 Subject: [PATCH 08/13] PMacc warp and atomic function refactor - introduce support for HIP - change the way how `atomicAllInc` is implemented if no accelerator is given - use alpaka intrinsics and warp functions --- include/pmacc/nvidia/atomic.hpp | 317 ++++++++++++++++---------------- include/pmacc/nvidia/warp.hpp | 10 +- 2 files changed, 166 insertions(+), 161 deletions(-) diff --git a/include/pmacc/nvidia/atomic.hpp b/include/pmacc/nvidia/atomic.hpp index 1988da4d9a..13dba45f35 100644 --- a/include/pmacc/nvidia/atomic.hpp +++ b/include/pmacc/nvidia/atomic.hpp @@ -21,182 +21,179 @@ #pragma once - #include "pmacc/types.hpp" -#if( PMACC_CUDA_ENABLED == 1 ) -# include "pmacc/nvidia/warp.hpp" -#endif +#include "pmacc/memory/Array.hpp" +#include "pmacc/nvidia/warp.hpp" + +#include +#include + #include + +#include #include namespace pmacc { namespace nvidia +{ +namespace detail { - namespace detail { - - template - struct AtomicAllInc + template + struct AtomicAllInc + { + template< typename T_Acc, typename T_Hierarchy > + HDINLINE T_Type + operator()(const T_Acc& acc, T_Type* ptr, const T_Hierarchy& hierarchy) { - template< typename T_Acc, typename T_Hierarchy > - HDINLINE T_Type - operator()(const T_Acc& acc, T_Type* ptr, const T_Hierarchy& hierarchy) - { - return ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, ptr, T_Type(1), hierarchy); - } + 
return ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, ptr, T_Type(1), hierarchy); + } + }; + +#if CUPLA_DEVICE_COMPILE == 1 + /** + * Trait that returns whether an optimized version of AtomicAllInc + * exists for Kepler architectures (and up) + */ + template + struct AtomicAllIncIsOptimized + { + enum{ + value = boost::is_same::value || + boost::is_same::value || + boost::is_same::value || + boost::is_same::value || + boost::is_same::value }; - -#if PMACC_CUDA_ARCH >= 300 - /** - * Trait that returns whether an optimized version of AtomicAllInc - * exists for Kepler architectures (and up) - */ - template - struct AtomicAllIncIsOptimized + }; + + /** + * AtomicAllInc for Kepler and up + * Defaults to unoptimized version for unsupported types + */ + template::value> + struct AtomicAllIncKepler: public AtomicAllInc + {}; + + /** + * Optimized version + * + * This warp aggregated atomic increment implementation based on nvidia parallel forall example + * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ + * (author: Andrew Adinetz, date: October 1th, 2014) + * + */ + template + struct AtomicAllIncKepler + { + template< typename T_Acc, typename T_Hierarchy > + HDINLINE T_Type + operator()(const T_Acc& acc,T_Type* ptr, const T_Hierarchy& hierarchy) { - enum{ - value = boost::is_same::value || - boost::is_same::value || - boost::is_same::value || - boost::is_same::value || - boost::is_same::value - }; - }; - - /** - * AtomicAllInc for Kepler and up - * Defaults to unoptimized version for unsupported types - */ - template::value> - struct AtomicAllIncKepler: public AtomicAllInc - {}; - - /** - * Optimized version - * - * This warp aggregated atomic increment implementation based on nvidia parallel forall example - * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ - * (author: Andrew Adinetz, date: October 1th, 2014) + const auto mask = alpaka::warp::activemask(acc); + const auto leader = alpaka::intrinsic::ffs(acc, static_cast>(mask)) - 1; + + T_Type result; + const int laneId = getLaneId(); + /* Get the start value for this warp */ + if (laneId == leader) + result = ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc,ptr, static_cast(alpaka::intrinsic::popcount(acc, mask)), hierarchy); + result = warpBroadcast(result, leader); + /* Add offset per thread */ + return result + static_cast(alpaka::intrinsic::popcount(acc, mask & (( static_cast(1u) << laneId) - 1u))); + + } + }; + + /** + * Optimized version for int64. 
+ * As CUDA atomicAdd does not support int64 directly we just cast it + * and call the uint64 implementation + */ + template<> + struct AtomicAllIncKepler + { + template< typename T_Acc, typename T_Hierarchy > + HDINLINE long long int + operator()(const T_Acc& acc, long long int* ptr, const T_Hierarchy&, const T_Hierarchy& hierarchy ) + { + return static_cast( + AtomicAllIncKepler()( + acc, + reinterpret_cast(ptr), + hierarchy + ) + ); + } + }; + + template + struct AtomicAllInc: public AtomicAllIncKepler + {}; +#endif // CUPLA_DEVICE_COMPILE == 1 + +} // namespace detail + + /** optimized atomic increment + * + * - only optimized if PTX ISA >=3.0 + * - this atomic uses warp aggregation to speedup the operation compared to cuda `atomicInc()` + * - cuda `atomicAdd()` is used if the compute architecture does not support warp aggregation + * - all participate threads must change the same pointer (ptr) else the result is unspecified + * + * @param ptr pointer to memory (must be the same address for all threads in a block) + * + */ + template + HDINLINE + T atomicAllInc(const T_Acc& acc, T *ptr, const T_Hierarchy& hierarchy) + { + return detail::AtomicAllInc= 300 || BOOST_COMP_HIP) >()(acc, ptr, hierarchy); + } + + template + HDINLINE + T atomicAllInc(T *ptr) + { + /* Dirty hack to call an alpaka accelerator based function. + * Members of the fakeAcc will be uninitialized and must not be accessed. * + * The id provider for particles is the only code where atomicAllInc is used without an accelerator. + * @todo remove the unsafe faked accelerator */ - template - struct AtomicAllIncKepler - { - template< typename T_Acc, typename T_Hierarchy > - HDINLINE T_Type - operator()(const T_Acc& acc,T_Type* ptr, const T_Hierarchy& hierarchy) - { - /* Get a bitmask with 1 for each thread in the warp, that executes this */ -#if(__CUDACC_VER_MAJOR__ >= 9) - const int mask = __activemask(); -#else - const int mask = __ballot(1); + pmacc::memory::Array fakeAcc; + return atomicAllInc(fakeAcc[0], ptr, ::alpaka::hierarchy::Grids()); + } + + /** optimized atomic value exchange + * + * - only optimized if PTX ISA >=2.0 + * - this atomic uses warp vote function to speedup the operation + * compared to cuda `atomicExch()` + * - cuda `atomicExch()` is used if the compute architecture not supports + * warps vote functions + * - all participate threads must change the same + * pointer (ptr) and set the same value, else the + * result is unspecified + * + * @param ptr pointer to memory (must be the same address for all threads in a block) + * @param value new value (must be the same for all threads in a block) + */ + template + DINLINE void + atomicAllExch(const T_Acc& acc, T_Type* ptr, const T_Type value, const T_Hierarchy& hierarchy) + { + + const auto mask = alpaka::warp::activemask(acc); + const auto leader = alpaka::intrinsic::ffs(acc, static_cast>(mask)) - 1; + +#if CUPLA_DEVICE_COMPILE == 1 + if (getLaneId() == leader) #endif - /* select the leader */ - const int leader = __ffs(mask) - 1; - T_Type result; - const int laneId = getLaneId(); - /* Get the start value for this warp */ - if (laneId == leader) - result = ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc,ptr, static_cast(__popc(mask)), hierarchy); - result = warpBroadcast(result, leader); - /* Add offset per thread */ - return result + static_cast(__popc(mask & ((1 << laneId) - 1))); - } - }; - - /** - * Optimized version for int64. 
- * As CUDA atomicAdd does not support int64 directly we just cast it - * and call the uint64 implementation - */ - template<> - struct AtomicAllIncKepler - { - template< typename T_Acc, typename T_Hierarchy > - HDINLINE long long int - operator()(const T_Acc& acc, long long int* ptr, const T_Hierarchy&, const T_Hierarchy& hierarchy ) - { - return static_cast( - AtomicAllIncKepler()( - acc, - reinterpret_cast(ptr), - hierarchy - ) - ); - } - }; - - template - struct AtomicAllInc: public AtomicAllIncKepler - {}; -#endif /* PMACC_CUDA_ARCH >= 300 */ - - } // namespace detail - -/** optimized atomic increment - * - * - only optimized if PTX ISA >=3.0 - * - this atomic uses warp aggregation to speedup the operation compared to cuda `atomicInc()` - * - cuda `atomicAdd()` is used if the compute architecture does not support warp aggregation - * - all participate threads must change the same pointer (ptr) else the result is unspecified - * - * @param ptr pointer to memory (must be the same address for all threads in a block) - * - */ -template -HDINLINE -T atomicAllInc(const T_Acc& acc, T *ptr, const T_Hierarchy& hierarchy) -{ - return detail::AtomicAllInc= 300) >()(acc, ptr, hierarchy); -} - -template -HDINLINE -T atomicAllInc(T *ptr) -{ -#ifdef __CUDA_ARCH__ - return atomicAllInc(alpaka::atomic::AtomicUniformCudaHipBuiltIn(), ptr, ::alpaka::hierarchy::Grids()); -#else - // assume that we can use the standard library atomics if we are not on gpu - return atomicAllInc(alpaka::atomic::AtomicStdLibLock<16>(), ptr, ::alpaka::hierarchy::Grids()); -#endif -} - -/** optimized atomic value exchange - * - * - only optimized if PTX ISA >=2.0 - * - this atomic uses warp vote function to speedup the operation - * compared to cuda `atomicExch()` - * - cuda `atomicExch()` is used if the compute architecture not supports - * warps vote functions - * - all participate threads must change the same - * pointer (ptr) and set the same value, else the - * result is unspecified - * - * @param ptr pointer to memory (must be the same address for all threads in a block) - * @param value new value (must be the same for all threads in a block) - */ -template -DINLINE void -atomicAllExch(const T_Acc& acc, T_Type* ptr, const T_Type value, const T_Hierarchy& hierarchy) -{ -#if (__CUDA_ARCH__ >= 200) -# if(__CUDACC_VER_MAJOR__ >= 9) - const int mask = __activemask(); -# else - const int mask = __ballot(1); -# endif - // select the leader - const int leader = __ffs(mask) - 1; - // leader does the update - if (getLaneId() == leader) -#endif - ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, ptr, value, hierarchy); -} - + ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, ptr, value, hierarchy); + } } //namespace nvidia } //namespace pmacc diff --git a/include/pmacc/nvidia/warp.hpp b/include/pmacc/nvidia/warp.hpp index 48c58dbad6..da120de01b 100644 --- a/include/pmacc/nvidia/warp.hpp +++ b/include/pmacc/nvidia/warp.hpp @@ -21,6 +21,7 @@ #pragma once +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) #include "pmacc/types.hpp" @@ -42,10 +43,15 @@ DINLINE uint32_t getLaneId() asm("mov.u32 %0, %%laneid;" : "=r" (id)); return id; } +#elif BOOST_COMP_HIP +DINLINE uint32_t getLaneId() +{ + return __lane_id(); +} #endif -#if (__CUDA_ARCH__ >= 300) +#if (__CUDA_ARCH__ >= 300 || BOOST_COMP_HIP) /** broadcast data within a warp * * required PTX ISA >=3.0 @@ -114,3 +120,5 @@ DINLINE double warpBroadcast(double data, const int32_t srcLaneId) } //namespace nvidia } //namespace pmacc + +#endif From 
e47ea6121d0bc0a9ad320f6217f01aa7ce369d57 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Thu, 17 Sep 2020 09:42:25 +0200 Subject: [PATCH 09/13] Replace usage of pmacc::memory::makeUnique with C++14 std::make_unique --- include/picongpu/fields/EMFieldBase.tpp | 3 +-- include/picongpu/fields/FieldJ.tpp | 3 +-- include/picongpu/fields/FieldTmp.tpp | 5 ++--- include/picongpu/particles/ParticlesFunctors.hpp | 3 +-- include/picongpu/particles/flylite/NonLTE.tpp | 13 +++++-------- .../plugins/xrayScattering/XrayScattering.hpp | 7 +++---- .../plugins/xrayScattering/XrayScatteringWriter.hpp | 4 ++-- .../picongpu/simulation/control/MySimulation.hpp | 13 ++++++------- include/pmacc/memory/buffers/ExchangeIntern.hpp | 13 ++++++------- .../examples/gameOfLife2D/include/Evolution.hpp | 3 +-- 10 files changed, 28 insertions(+), 39 deletions(-) diff --git a/include/picongpu/fields/EMFieldBase.tpp b/include/picongpu/fields/EMFieldBase.tpp index 1371951691..9c3f56aa44 100644 --- a/include/picongpu/fields/EMFieldBase.tpp +++ b/include/picongpu/fields/EMFieldBase.tpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -58,7 +57,7 @@ namespace fields SimulationFieldHelper< MappingDesc >( cellDescription ), id( id ) { - buffer = pmacc::memory::makeUnique< Buffer >( + buffer = std::make_unique< Buffer >( cellDescription.getGridLayout( ) ); diff --git a/include/picongpu/fields/FieldJ.tpp b/include/picongpu/fields/FieldJ.tpp index 20272f1537..d3ec7f6453 100644 --- a/include/picongpu/fields/FieldJ.tpp +++ b/include/picongpu/fields/FieldJ.tpp @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -127,7 +126,7 @@ FieldJ::FieldJ( MappingDesc const & cellDescription ) : if( originRecvGuard != DataSpace::create(0) || endRecvGuard != DataSpace::create(0) ) { - fieldJrecv = pmacc::memory::makeUnique< GridBuffer >( + fieldJrecv = std::make_unique< GridBuffer >( buffer.getDeviceBuffer(), cellDescription.getGridLayout( ) ); diff --git a/include/picongpu/fields/FieldTmp.tpp b/include/picongpu/fields/FieldTmp.tpp index a6f3d718d6..12d28976c7 100644 --- a/include/picongpu/fields/FieldTmp.tpp +++ b/include/picongpu/fields/FieldTmp.tpp @@ -27,7 +27,6 @@ #include "picongpu/particles/traits/GetInterpolation.hpp" #include -#include #include #include #include @@ -66,10 +65,10 @@ namespace picongpu m_commTagGather = pmacc::traits::getNextId( ) + SPECIES_FIRSTTAG; using Buffer = GridBuffer< ValueType, simDim >; - fieldTmp = memory::makeUnique< Buffer >( cellDescription.getGridLayout( ) ); + fieldTmp = std::make_unique< Buffer >( cellDescription.getGridLayout( ) ); if( fieldTmpSupportGatherCommunication ) - fieldTmpRecv = memory::makeUnique< Buffer >( + fieldTmpRecv = std::make_unique< Buffer >( fieldTmp->getDeviceBuffer(), cellDescription.getGridLayout( ) ); diff --git a/include/picongpu/particles/ParticlesFunctors.hpp b/include/picongpu/particles/ParticlesFunctors.hpp index ae2e5140c0..103aa436f9 100644 --- a/include/picongpu/particles/ParticlesFunctors.hpp +++ b/include/picongpu/particles/ParticlesFunctors.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include "picongpu/particles/traits/GetIonizerList.hpp" #if( PMACC_CUDA_ENABLED == 1 ) @@ -99,7 +98,7 @@ struct CreateSpecies { DataConnector &dc = Environment<>::get().DataConnector(); dc.consume( - pmacc::memory::makeUnique( + std::make_unique( deviceHeap, *cellDesc, FrameType::getName() diff --git a/include/picongpu/particles/flylite/NonLTE.tpp b/include/picongpu/particles/flylite/NonLTE.tpp index 
aae44341bc..9c840cbb09 100644 --- a/include/picongpu/particles/flylite/NonLTE.tpp +++ b/include/picongpu/particles/flylite/NonLTE.tpp @@ -27,11 +27,9 @@ #include "picongpu/particles/particleToGrid/derivedAttributes/Density.def" #include "picongpu/particles/traits/GetShape.hpp" -/* pmacc */ #include #include #include -#include #include @@ -64,11 +62,10 @@ namespace flylite DataConnector &dc = Environment<>::get().DataConnector(); - using pmacc::memory::makeUnique; // once allocated for all ion species to share if( ! dc.hasId( helperFields::LocalEnergyHistogram::getName( "electrons" ) ) ) dc.consume( - makeUnique< helperFields::LocalEnergyHistogram >( + std::make_unique< helperFields::LocalEnergyHistogram >( "electrons", m_avgGridSizeLocal ) @@ -76,7 +73,7 @@ namespace flylite if( ! dc.hasId( helperFields::LocalEnergyHistogram::getName( "photons" ) ) ) dc.consume( - makeUnique< helperFields::LocalEnergyHistogram >( + std::make_unique< helperFields::LocalEnergyHistogram >( "photons", m_avgGridSizeLocal ) @@ -84,7 +81,7 @@ namespace flylite if( ! dc.hasId( helperFields::LocalDensity::getName( "electrons" ) ) ) dc.consume( - makeUnique< helperFields::LocalDensity >( + std::make_unique< helperFields::LocalDensity >( "electrons", m_avgGridSizeLocal ) @@ -93,7 +90,7 @@ namespace flylite // for each ion species if( ! dc.hasId( helperFields::LocalRateMatrix::getName( ionSpeciesName ) ) ) dc.consume( - makeUnique< helperFields::LocalRateMatrix >( + std::make_unique< helperFields::LocalRateMatrix >( ionSpeciesName, m_avgGridSizeLocal ) @@ -101,7 +98,7 @@ namespace flylite if( ! dc.hasId( helperFields::LocalDensity::getName( ionSpeciesName ) ) ) dc.consume( - makeUnique< helperFields::LocalDensity >( + std::make_unique< helperFields::LocalDensity >( ionSpeciesName, m_avgGridSizeLocal ) diff --git a/include/picongpu/plugins/xrayScattering/XrayScattering.hpp b/include/picongpu/plugins/xrayScattering/XrayScattering.hpp index 85c9d0c5b6..c52d32d6fb 100644 --- a/include/picongpu/plugins/xrayScattering/XrayScattering.hpp +++ b/include/picongpu/plugins/xrayScattering/XrayScattering.hpp @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -290,7 +289,7 @@ namespace xrayScattering * CoordinateTransform.hpp is still set to (0,0,0) when the * XrayScattering object is initialized. */ - probingBeam = pmacc::memory::makeUnique< + probingBeam = std::make_unique< beam::XrayScatteringBeam >( ); // Set the steps at which the xrayScattering amplitude is // calculated. @@ -337,7 +336,7 @@ namespace xrayScattering ); } // Allocate amplitude buffer. - amplitude = pmacc::memory::makeUnique< ComplexBuffer >( + amplitude = std::make_unique< ComplexBuffer >( DataSpace< DIM1 >( bufferSize ) ); // Initialize, on device, its fields with zero. amplitude->getDeviceBuffer( ).setValue( 0.0 ); @@ -376,7 +375,7 @@ namespace xrayScattering ).getGlobalDomain( ).size.productOfComponents( ); // Initialize an object responsible for output writing. 
- dataWriter = pmacc::memory::makeUnique< XrayScatteringWriter< + dataWriter = std::make_unique< XrayScatteringWriter< float_X > >( pluginPrefix + "Output", fileExtension, diff --git a/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp b/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp index bec1dbcab4..f48680e84b 100644 --- a/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp +++ b/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp @@ -213,7 +213,7 @@ namespace xrayScattering if ( outputMemoryLayout == OutputMemoryLayout::Distribute ) { // Open a series for a parallel write. - openPMDSeries = pmacc::memory::makeUnique< ::openPMD::Series >( + openPMDSeries = std::make_unique< ::openPMD::Series >( fullName, at, mpiCommunicator @@ -222,7 +222,7 @@ namespace xrayScattering else { // Open a series for a serial write. - openPMDSeries = pmacc::memory::makeUnique< ::openPMD::Series >( + openPMDSeries = std::make_unique< ::openPMD::Series >( fullName, at ); diff --git a/include/picongpu/simulation/control/MySimulation.hpp b/include/picongpu/simulation/control/MySimulation.hpp index 6e038578b9..cc12a59768 100644 --- a/include/picongpu/simulation/control/MySimulation.hpp +++ b/include/picongpu/simulation/control/MySimulation.hpp @@ -341,7 +341,7 @@ class MySimulation : public SimulationHelper ); using RNGFactory = pmacc::random::RNGProvider< simDim, random::Generator >; - auto rngFactory = pmacc::memory::makeUnique< RNGFactory >( + auto rngFactory = std::make_unique< RNGFactory >( Environment::get().SubGrid().getLocalDomain().size ); if (Environment::get().GridController().getGlobalRank() == 0) @@ -444,7 +444,7 @@ class MySimulation : public SimulationHelper ); cuplaStreamSynchronize( 0 ); - auto mallocMCBuffer = pmacc::memory::makeUnique< MallocMCBuffer >( deviceHeap ); + auto mallocMCBuffer = std::make_unique< MallocMCBuffer >( deviceHeap ); dc.consume( std::move( mallocMCBuffer ) ); #endif meta::ForEach< VectorAllSpecies, particles::LogMemoryStatisticsForSpecies > logMemoryStatisticsForSpecies; @@ -683,16 +683,15 @@ class MySimulation : public SimulationHelper void initFields( DataConnector& dataConnector ) { - using pmacc::memory::makeUnique; - auto fieldB = makeUnique< FieldB >( *cellDescription ); + auto fieldB = std::make_unique< FieldB >( *cellDescription ); dataConnector.consume( std::move( fieldB ) ); - auto fieldE = makeUnique< FieldE >( *cellDescription ); + auto fieldE = std::make_unique< FieldE >( *cellDescription ); dataConnector.consume( std::move( fieldE ) ); - auto fieldJ = makeUnique< FieldJ >( *cellDescription ); + auto fieldJ = std::make_unique< FieldJ >( *cellDescription ); dataConnector.consume( std::move( fieldJ ) ); for( uint32_t slot = 0; slot < fieldTmpNumSlots; ++slot) { - auto fieldTmp = makeUnique< FieldTmp >( *cellDescription, slot ); + auto fieldTmp = std::make_unique< FieldTmp >( *cellDescription, slot ); dataConnector.consume( std::move( fieldTmp ) ); } } diff --git a/include/pmacc/memory/buffers/ExchangeIntern.hpp b/include/pmacc/memory/buffers/ExchangeIntern.hpp index feb86ca3b0..9d4c5ce671 100644 --- a/include/pmacc/memory/buffers/ExchangeIntern.hpp +++ b/include/pmacc/memory/buffers/ExchangeIntern.hpp @@ -27,7 +27,6 @@ #include "pmacc/memory/dataTypes/Mask.hpp" #include "pmacc/memory/buffers/DeviceBufferIntern.hpp" #include "pmacc/memory/buffers/HostBufferIntern.hpp" -#include "pmacc/memory/MakeUnique.hpp" #include "pmacc/eventSystem/tasks/Factory.hpp" #include "pmacc/eventSystem/tasks/TaskReceive.hpp" @@ 
-72,7 +71,7 @@ namespace pmacc /*This is only a pointer to other device data */ using DeviceBuffer = DeviceBufferIntern; - deviceBuffer = memory::makeUnique( + deviceBuffer = std::make_unique( source, tmp_size, exchangeTypeToOffset( @@ -86,7 +85,7 @@ namespace pmacc if (DIM > DIM1) { /*create double buffer on gpu for faster memory transfers*/ - deviceDoubleBuffer = memory::makeUnique( + deviceDoubleBuffer = std::make_unique( tmp_size, false, true @@ -96,7 +95,7 @@ namespace pmacc if(!Environment<>::get().isMpiDirectEnabled()) { using HostBuffer = HostBufferIntern; - hostBuffer = memory::makeUnique(tmp_size); + hostBuffer = std::make_unique(tmp_size); } } @@ -105,7 +104,7 @@ namespace pmacc Exchange(exchange, communicationTag), deviceDoubleBuffer(nullptr), hostBuffer(nullptr) { using DeviceBuffer = DeviceBufferIntern; - deviceBuffer = memory::makeUnique( + deviceBuffer = std::make_unique( exchangeDataSpace, sizeOnDevice ); @@ -113,7 +112,7 @@ namespace pmacc if (DIM > DIM1) { /*create double buffer on gpu for faster memory transfers*/ - deviceDoubleBuffer = memory::makeUnique( + deviceDoubleBuffer = std::make_unique( exchangeDataSpace, false, true @@ -123,7 +122,7 @@ namespace pmacc if(!Environment<>::get().isMpiDirectEnabled()) { using HostBuffer = HostBufferIntern; - hostBuffer = memory::makeUnique(exchangeDataSpace); + hostBuffer = std::make_unique(exchangeDataSpace); } } diff --git a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp index 2fd631723e..d56ece4e8f 100644 --- a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp +++ b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -246,7 +245,7 @@ namespace kernel Space const & guardSize ) { - mapping = memory::makeUnique< T_MappingDesc >( + mapping = std::make_unique< T_MappingDesc >( layout, guardSize ); From 84b8c26ea34fbda0a2b11ba3425b0bc74a39f2e5 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Thu, 17 Sep 2020 09:43:31 +0200 Subject: [PATCH 10/13] Remove pmacc::memory::makeUnique that is no longer used --- include/pmacc/memory/MakeUnique.hpp | 48 ----------------------------- 1 file changed, 48 deletions(-) delete mode 100644 include/pmacc/memory/MakeUnique.hpp diff --git a/include/pmacc/memory/MakeUnique.hpp b/include/pmacc/memory/MakeUnique.hpp deleted file mode 100644 index 38d92ad6bf..0000000000 --- a/include/pmacc/memory/MakeUnique.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright 2019-2020 Sergei Bastrakov - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - -#pragma once - -#include -#include - - -namespace pmacc -{ -namespace memory -{ - - /* - * Analogue of std::make_unique for C++11, except not disabled for arrays. 
- * Implementation is taken from - * https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique - */ - template< - typename T, - typename ... T_Args - > - inline std::unique_ptr< T > makeUnique( T_Args && ... args ) - { - return std::unique_ptr< T >( new T( std::forward< T_Args >( args ) ... ) ); - } - -} // namespace memory -} // namespace pmacc From 8aefa63503dbfff92981f56da575c6a51bb4f459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Fri, 18 Sep 2020 09:14:46 +0200 Subject: [PATCH 11/13] HIP: RNG XorMin Add HIP support for random number generator XorMin. --- include/pmacc/Environment.hpp | 10 ++++- include/pmacc/random/methods/XorMin.hpp | 51 +++++++++++++++++++------ 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp index b35facbab4..7fe06a5554 100644 --- a/include/pmacc/Environment.hpp +++ b/include/pmacc/Environment.hpp @@ -490,9 +490,15 @@ namespace detail const int tryDeviceId = (deviceOffset + deviceNumber) % num_gpus; log("Trying to allocate device %1%.") % tryDeviceId; -#if (PMACC_CUDA_ENABLED == 1) + +#if(BOOST_LANG_CUDA || BOOST_LANG_HIP) +# if(BOOST_LANG_CUDA) cudaDeviceProp devProp; - CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&devProp, tryDeviceId)); +# elif(BOOST_LANG_HIP) + hipDeviceProp_t devProp; +# endif + + CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId)); /* If the cuda gpu compute mode is 'default' * (https://docs.nvidia.com/cuda/cuda-c-programming-guide/#compute-modes) diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp index 10850befcf..1344b24310 100644 --- a/include/pmacc/random/methods/XorMin.hpp +++ b/include/pmacc/random/methods/XorMin.hpp @@ -24,10 +24,12 @@ #include "pmacc/types.hpp" #include "pmacc/static_assert.hpp" -#if( PMACC_CUDA_ENABLED != 1 ) -# include "pmacc/random/methods/AlpakaRand.hpp" -#else +#if( BOOST_LANG_CUDA ) # include +#elif( BOOST_LANG_HIP ) +# include +#else +# include "pmacc/random/methods/AlpakaRand.hpp" #endif @@ -38,15 +40,17 @@ namespace random namespace methods { -#if( PMACC_CUDA_ENABLED != 1 ) - //! fallback to alpaka RNG if a cpu accelerator is used - template< typename T_Acc = cupla::Acc> - using XorMin = AlpakaRand< T_Acc >; -#else +#if( BOOST_LANG_CUDA || BOOST_LANG_HIP ) //! 
Uses the CUDA XORWOW RNG but does not store state members required for normal distribution template< typename T_Acc = cupla::Acc> class XorMin { +#if (BOOST_LANG_HIP) + using NativeStateType = hiprandStateXORWOW_t; +#elif (BOOST_LANG_CUDA) + using NativeStateType = curandStateXORWOW_t; +#endif + public: class StateType { @@ -63,14 +67,23 @@ namespace methods HDINLINE StateType( ) { } - DINLINE StateType( curandStateXORWOW_t const & other ): d( other.d ) + DINLINE StateType( NativeStateType const & other ): d( other.d ) { +#if (BOOST_LANG_HIP) + auto const* nativeStateArray = other.x; + PMACC_STATIC_ASSERT_MSG( + sizeof( v ) == sizeof( other.x ), + Unexpected_sizes + ); +#elif (BOOST_LANG_CUDA) + auto const* nativeStateArray = other.v; PMACC_STATIC_ASSERT_MSG( sizeof( v ) == sizeof( other.v ), Unexpected_sizes ); +#endif for( unsigned i = 0; i < sizeof( v ) / sizeof( v[ 0 ] ); i++ ) - v[ i ] = other.v[ i ]; + v[ i ] = nativeStateArray[ i ]; } }; @@ -82,13 +95,23 @@ namespace methods uint32_t subsequence = 0 ) const { - curandStateXORWOW_t tmpState; - curand_init( + NativeStateType tmpState; + +#if (BOOST_LANG_HIP) +# define define PMACC_RNG_INIT_FN hiprand_init +#elif (BOOST_LANG_CUDA) +# define define PMACC_RNG_INIT_FN curand_init +#endif + + PMACC_RNG_INIT_FN( seed, subsequence, 0, &tmpState ); + +#undef PMACC_RNG_INIT_FN + state = tmpState; } @@ -132,6 +155,10 @@ namespace methods return "XorMin"; } }; +#else + //! fallback to alpaka RNG if a cpu accelerator is used + template< typename T_Acc = cupla::Acc> + using XorMin = AlpakaRand< T_Acc >; #endif } // namespace methods } // namespace random From a27b90e48ffc81092271a680aa207879ead07936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Fri, 18 Sep 2020 09:16:35 +0200 Subject: [PATCH 12/13] HIP compatibility Increase HIP compatibility to PMacc and PIConGPU. 
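Note on the portability macros this patch leans on: ALPAKA_API_PREFIX, used in the Environment.hpp hunks below, prepends the proper runtime-API prefix (cuda.../hip...) to a function name at preprocessing time. The real macro is provided by cupla/alpaka; the following standalone stand-in (DEMO_API_PREFIX, PP_CONCAT and PP_STR are invented names for this sketch only) merely illustrates the token-pasting idea.

    #include <cstdio>

    #define PP_CONCAT_IMPL( a, b ) a##b
    #define PP_CONCAT( a, b ) PP_CONCAT_IMPL( a, b )
    #define PP_STR_IMPL( x ) #x
    #define PP_STR( x ) PP_STR_IMPL( x )

    /* pick the runtime-API prefix; an undefined predef macro evaluates to 0 in
     * an #if, so a plain host compile falls through to the HIP branch here */
    #if( BOOST_LANG_CUDA )
    #   define DEMO_API_PREFIX( name ) PP_CONCAT( cuda, name )
    #else
    #   define DEMO_API_PREFIX( name ) PP_CONCAT( hip, name )
    #endif

    int main()
    {
        // without CUDA language detection this prints "hipGetDeviceProperties"
        std::puts( PP_STR( DEMO_API_PREFIX( GetDeviceProperties ) ) );
    }

The same single call site therefore expands to cudaGetDeviceProperties or hipGetDeviceProperties depending on the backend, which is exactly how the device-property and device-flag calls are unified in the diff below.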
--- include/picongpu/_defaultParam.loader | 2 +- .../fields/currentDeposition/Strategy.def | 14 +++++++++++++ include/picongpu/particles/Particles.hpp | 2 +- .../picongpu/particles/ParticlesFunctors.hpp | 2 +- .../simulation/control/MySimulation.hpp | 12 +++++++---- include/pmacc/Environment.hpp | 15 +++++++------- include/pmacc/PMaccConfig.cmake | 2 +- .../algorithms/math/doubleMath/bessel.tpp | 16 +++++++-------- .../math/doubleMath/floatingPoint.tpp | 6 +++--- .../pmacc/algorithms/math/doubleMath/modf.tpp | 2 +- .../algorithms/math/floatMath/bessel.tpp | 16 +++++++-------- .../pmacc/algorithms/math/floatMath/exp.tpp | 2 +- .../math/floatMath/floatingPoint.tpp | 6 +++--- .../pmacc/algorithms/math/floatMath/modf.tpp | 2 +- include/pmacc/math/ConstVector.hpp | 6 +++--- .../particles/memory/boxes/ParticlesBox.hpp | 20 ++++++++++--------- include/pmacc/random/methods/XorMin.hpp | 18 ++++++++--------- include/pmacc/static_assert.hpp | 2 +- include/pmacc/types.hpp | 2 +- 19 files changed, 83 insertions(+), 64 deletions(-) diff --git a/include/picongpu/_defaultParam.loader b/include/picongpu/_defaultParam.loader index c9d3b14f75..04c53a8c58 100644 --- a/include/picongpu/_defaultParam.loader +++ b/include/picongpu/_defaultParam.loader @@ -26,7 +26,7 @@ #pragma once #include "picongpu/param/dimension.param" -#if( PMACC_CUDA_ENABLED == 1 ) +#if(BOOST_LANG_CUDA || BOOST_COMP_HIP) # include "picongpu/param/mallocMC.param" #endif #include "picongpu/param/memory.param" diff --git a/include/picongpu/fields/currentDeposition/Strategy.def b/include/picongpu/fields/currentDeposition/Strategy.def index 6950665ccd..f4963bea30 100644 --- a/include/picongpu/fields/currentDeposition/Strategy.def +++ b/include/picongpu/fields/currentDeposition/Strategy.def @@ -143,6 +143,20 @@ namespace traits alpaka::acc::AccGpuCudaRt< T_Args... > > { + // GPU Utilization is higher compared to `StridedCachedSupercells` + using type = strategy::CachedSupercells; + }; +#endif + +#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 ) + template< + typename ... T_Args + > + struct GetDefaultStrategy< + alpaka::acc::AccGpuHipRt< T_Args... 
> + > + { + // GPU Utilization is higher compared to `StridedCachedSupercells` using type = strategy::CachedSupercells; }; #endif diff --git a/include/picongpu/particles/Particles.hpp b/include/picongpu/particles/Particles.hpp index 08aaccb71d..eda48ded67 100644 --- a/include/picongpu/particles/Particles.hpp +++ b/include/picongpu/particles/Particles.hpp @@ -49,7 +49,7 @@ namespace picongpu { using namespace pmacc; -#if( PMACC_CUDA_ENABLED != 1 ) +#if(!BOOST_LANG_CUDA && !BOOST_COMP_HIP) /* dummy because we are not using mallocMC with cupla * DeviceHeap is defined in `mallocMC.param` */ diff --git a/include/picongpu/particles/ParticlesFunctors.hpp b/include/picongpu/particles/ParticlesFunctors.hpp index 103aa436f9..4630bfcf96 100644 --- a/include/picongpu/particles/ParticlesFunctors.hpp +++ b/include/picongpu/particles/ParticlesFunctors.hpp @@ -125,7 +125,7 @@ struct LogMemoryStatisticsForSpecies const std::shared_ptr& deviceHeap ) const { -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) log("mallocMC: free slots for species %3%: %1% a %2%") % deviceHeap->getAvailableSlots( cupla::manager::Device< cupla::AccDev >::get().current(), diff --git a/include/picongpu/simulation/control/MySimulation.hpp b/include/picongpu/simulation/control/MySimulation.hpp index cc12a59768..57395bab49 100644 --- a/include/picongpu/simulation/control/MySimulation.hpp +++ b/include/picongpu/simulation/control/MySimulation.hpp @@ -373,7 +373,9 @@ class MySimulation : public SimulationHelper this->bremsstrahlungPhotonAngle.init(); } +#endif +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) auto nativeCudaStream = cupla::manager::Stream< cupla::AccDev, cupla::AccStream @@ -425,7 +427,7 @@ class MySimulation : public SimulationHelper throw std::runtime_error(msg.str()); } -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) size_t heapSize = freeGpuMem - reservedGpuMemorySize; if( Environment<>::get().MemoryInfo().isSharedMemoryPool() ) @@ -443,10 +445,12 @@ class MySimulation : public SimulationHelper heapSize ); cuplaStreamSynchronize( 0 ); - - auto mallocMCBuffer = std::make_unique< MallocMCBuffer >( deviceHeap ); +# if( PMACC_CUDA_ENABLED == 1 ) + auto mallocMCBuffer = std::make_unique< MallocMCBuffer< DeviceHeap > >( deviceHeap ); dc.consume( std::move( mallocMCBuffer ) ); +# endif #endif + meta::ForEach< VectorAllSpecies, particles::LogMemoryStatisticsForSpecies > logMemoryStatisticsForSpecies; logMemoryStatisticsForSpecies( deviceHeap ); @@ -455,7 +459,7 @@ class MySimulation : public SimulationHelper IdProvider::init(); -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) /* add CUDA streams to the StreamController for concurrent execution */ Environment<>::get().StreamController().addStreams(6); #endif diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp index 7fe06a5554..7d222d65d2 100644 --- a/include/pmacc/Environment.hpp +++ b/include/pmacc/Environment.hpp @@ -468,7 +468,7 @@ namespace detail { int num_gpus = 0; //number of gpus cuplaGetDeviceCount(&num_gpus); -#if (PMACC_CUDA_ENABLED == 1) +#if(BOOST_LANG_CUDA|| BOOST_COMP_HIP) //##ERROR handling if (num_gpus < 1) //check if cupla device is found { @@ -506,7 +506,7 @@ namespace detail * The index used to select a device is based on the local MPI rank so * that each rank tries a different device. 
*/ - if (devProp.computeMode == cudaComputeModeDefault) + if (devProp.computeMode == ALPAKA_API_PREFIX(ComputeModeDefault)) { maxTries = 1; log("Device %1% is running in default mode.") % tryDeviceId; @@ -532,18 +532,17 @@ namespace detail if (rc == cuplaSuccess) { -#if (PMACC_CUDA_ENABLED == 1) - cudaDeviceProp dprop; - CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&dprop, tryDeviceId)); - log ("Set device to %1%: %2%") % tryDeviceId % dprop.name; - if(cudaErrorSetOnActiveProcess == cudaSetDeviceFlags(cudaDeviceScheduleSpin)) +#if(BOOST_LANG_CUDA || BOOST_LANG_HIP) + CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId)); + log ("Set device to %1%: %2%") % tryDeviceId % devProp.name; + if(ALPAKA_API_PREFIX(ErrorSetOnActiveProcess) == ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin))) { cuplaGetLastError(); //reset all errors /* - because of cuplaStreamCreate was called cuplaSetDeviceFlags crashed * - to set the flags reset the device and set flags again */ CUDA_CHECK(cuplaDeviceReset()); - CUDA_CHECK((cuplaError_t)cudaSetDeviceFlags(cudaDeviceScheduleSpin)); + CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin))); } #endif CUDA_CHECK(cuplaGetLastError()); diff --git a/include/pmacc/PMaccConfig.cmake b/include/pmacc/PMaccConfig.cmake index 2878eacee3..2c0a14e481 100644 --- a/include/pmacc/PMaccConfig.cmake +++ b/include/pmacc/PMaccConfig.cmake @@ -368,7 +368,7 @@ endif() # Find mallocMC ################################################################################ -if(ALPAKA_ACC_GPU_CUDA_ENABLE) +if(ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE) set(mallocMC_ALPAKA_PROVIDER "extern" CACHE STRING "Select which alpaka is used for mallocMC") find_package(mallocMC 2.5.0 QUIET) diff --git a/include/pmacc/algorithms/math/doubleMath/bessel.tpp b/include/pmacc/algorithms/math/doubleMath/bessel.tpp index c6f4af59be..b099bef2f8 100644 --- a/include/pmacc/algorithms/math/doubleMath/bessel.tpp +++ b/include/pmacc/algorithms/math/doubleMath/bessel.tpp @@ -39,7 +39,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::cyl_bessel_i0( x ); #else return boost::math::cyl_bessel_i( @@ -57,7 +57,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::cyl_bessel_i1( x ); #else return boost::math::cyl_bessel_i( @@ -75,7 +75,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::j0( x ); #else return boost::math::cyl_bessel_j( @@ -93,7 +93,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::j1( x ); #else return boost::math::cyl_bessel_j( @@ -117,7 +117,7 @@ namespace bessel result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::jn( n, x @@ -138,7 +138,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::y0( x ); #else return boost::math::cyl_neumann( @@ -156,7 +156,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::y1( x ); #else return boost::math::cyl_neumann( @@ -180,7 +180,7 @@ namespace 
bessel result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::yn( n, x diff --git a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp index 2f32967fd3..ec3d7c7a46 100644 --- a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp +++ b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp @@ -39,7 +39,7 @@ struct Float2int_ru HDINLINE result operator( )(double value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__double2int_ru( value ); #else return static_cast(ceil(value)); @@ -54,7 +54,7 @@ struct Float2int_rd HDINLINE result operator( )(double value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__double2int_rd( value ); #else return static_cast(floor(value)); @@ -69,7 +69,7 @@ struct Float2int_rn HDINLINE result operator( )(double value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__double2int_rn( value ); #else if(value < 0.0) diff --git a/include/pmacc/algorithms/math/doubleMath/modf.tpp b/include/pmacc/algorithms/math/doubleMath/modf.tpp index 92ec4741da..b1532568c2 100644 --- a/include/pmacc/algorithms/math/doubleMath/modf.tpp +++ b/include/pmacc/algorithms/math/doubleMath/modf.tpp @@ -36,7 +36,7 @@ struct Modf HDINLINE double operator()(double value, double* intpart) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::modf(value, intpart); #else return std::modf(value, intpart); diff --git a/include/pmacc/algorithms/math/floatMath/bessel.tpp b/include/pmacc/algorithms/math/floatMath/bessel.tpp index e627ee012e..15554587d6 100644 --- a/include/pmacc/algorithms/math/floatMath/bessel.tpp +++ b/include/pmacc/algorithms/math/floatMath/bessel.tpp @@ -39,7 +39,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::cyl_bessel_i0f( x ); #else return boost::math::cyl_bessel_i( @@ -57,7 +57,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::cyl_bessel_i1f( x ); #else return boost::math::cyl_bessel_i( @@ -75,7 +75,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu_ return ::j0f( x ); #else return boost::math::cyl_bessel_j( @@ -93,7 +93,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::j1f( x ); #else return boost::math::cyl_bessel_j( @@ -117,7 +117,7 @@ namespace bessel result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::jnf( n, x @@ -138,7 +138,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::y0f( x ); #else return boost::math::cyl_neumann( @@ -156,7 +156,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::y1f( x ); #else return boost::math::cyl_neumann( @@ -180,7 +180,7 @@ namespace bessel result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::ynf( n, x diff --git a/include/pmacc/algorithms/math/floatMath/exp.tpp b/include/pmacc/algorithms/math/floatMath/exp.tpp index 772dcf87a9..97ae7e0d13 
100644 --- a/include/pmacc/algorithms/math/floatMath/exp.tpp +++ b/include/pmacc/algorithms/math/floatMath/exp.tpp @@ -38,7 +38,7 @@ namespace math HDINLINE float operator( )(const float& value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::log10f( value ); #else return ::log10( value ); diff --git a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp index 206b0118f1..681f33e21a 100644 --- a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp +++ b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp @@ -39,7 +39,7 @@ struct Float2int_ru HDINLINE result operator( )(float value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__float2int_ru( value ); #else return static_cast(ceil(value)); @@ -54,7 +54,7 @@ struct Float2int_rd HDINLINE result operator( )(float value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__float2int_rd( value ); #else return static_cast(floor(value)); @@ -69,7 +69,7 @@ struct Float2int_rn HDINLINE result operator( )(float value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__float2int_rn( value ); #else if(value < 0.0f) diff --git a/include/pmacc/algorithms/math/floatMath/modf.tpp b/include/pmacc/algorithms/math/floatMath/modf.tpp index d2678d179e..59efffd3ae 100644 --- a/include/pmacc/algorithms/math/floatMath/modf.tpp +++ b/include/pmacc/algorithms/math/floatMath/modf.tpp @@ -36,7 +36,7 @@ struct Modf HDINLINE float operator()(float value, float* intpart) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::modff(value, intpart); #else return std::modf(value, intpart); diff --git a/include/pmacc/math/ConstVector.hpp b/include/pmacc/math/ConstVector.hpp index b1fbad1b0a..b2b4df31de 100644 --- a/include/pmacc/math/ConstVector.hpp +++ b/include/pmacc/math/ConstVector.hpp @@ -26,13 +26,13 @@ #include "pmacc/types.hpp" /* select namespace depending on __CUDA_ARCH__ compiler flag*/ -#ifdef __CUDA_ARCH__ //we are on gpu +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu # define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_device,id) #else # define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_host,id) #endif -#ifdef __CUDACC__ +#if defined(__CUDACC__) || BOOST_COMP_HIP # define PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id,Name,Type,...) 
\ namespace PMACC_JOIN(pmacc_static_const_vector_device,id) \ { \ @@ -87,7 +87,7 @@ namespace PMACC_JOIN(pmacc_static_const_storage,id) \ } /* namespace pmacc_static_const_storage + id */ \ using namespace PMACC_JOIN(pmacc_static_const_storage,id) -#ifdef __CUDACC__ +#if defined(__CUDACC__) || BOOST_COMP_HIP # define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name,id) \ namespace PMACC_JOIN(pmacc_static_const_vector_device,id) \ { \ diff --git a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp index 031aae887b..aabb323990 100644 --- a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp +++ b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp @@ -22,7 +22,7 @@ #pragma once -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) # include #endif #include "pmacc/particles/frame_types.hpp" @@ -97,7 +97,7 @@ class ParticlesBox : protected DataBox, DIM> > const int maxTries = 13; //magic number is not performance critical for ( int numTries = 0; numTries < maxTries; ++numTries ) { -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) tmp = (FrameType*) m_deviceHeapHandle.malloc( acc, sizeof (FrameType) ); #else tmp = new FrameType; @@ -107,7 +107,7 @@ class ParticlesBox : protected DataBox, DIM> > /* disable all particles since we can not assume that newly allocated memory contains zeros */ for ( int i = 0; i < (int) math::CT::volume::type::value; ++i ) ( *tmp )[i][multiMask_] = 0; -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) /* takes care that changed values are visible to all threads inside this block*/ __threadfence_block( ); #endif @@ -115,10 +115,12 @@ class ParticlesBox : protected DataBox, DIM> > } else { +#ifndef BOOST_COMP_HIP printf( "%s: mallocMC out of memory (try %i of %i)\n", (numTries + 1) == maxTries ? "ERROR" : "WARNING", numTries + 1, maxTries ); +#endif } } @@ -133,7 +135,7 @@ class ParticlesBox : protected DataBox, DIM> > template DINLINE void removeFrame( const T_Acc & acc, FramePtr& frame ) { -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) m_deviceHeapHandle.free( acc, (void*) frame.ptr ); #else delete(frame.ptr); @@ -144,14 +146,14 @@ class ParticlesBox : protected DataBox, DIM> > HDINLINE FramePtr mapPtr( const FramePtr& devPtr ) const { -#ifndef __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) + return devPtr; +#else int64_t useOffset = hostMemoryOffset * static_cast (devPtr.ptr != 0); return FramePtr( reinterpret_cast ( reinterpret_cast (devPtr.ptr) - useOffset ) ); -#else - return devPtr; #endif } @@ -218,7 +220,7 @@ class ParticlesBox : protected DataBox, DIM> > frame->previousFrame = FramePtr( ); frame->nextFrame = FramePtr( *firstFrameNativPtr ); -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) /* - takes care that `next[index]` is visible to all threads on the gpu * - this is needed because later on in this method we change `previous` * of an other frame, this must be done in order! @@ -267,7 +269,7 @@ class ParticlesBox : protected DataBox, DIM> > frame->nextFrame = FramePtr( ); frame->previousFrame = FramePtr( *lastFrameNativPtr ); -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) /* - takes care that `next[index]` is visible to all threads on the gpu * - this is needed because later on in this method we change `next` * of an other frame, this must be done in order! 
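Side note on the hunks above: CUPLA_DEVICE_COMPILE (provided by cupla and assumed here to be 1 only during a device compilation pass) replaces the CUDA-specific __CUDA_ARCH__ check, so the same host/device dispatch also works under HIP. A minimal standalone sketch of that pattern, mirroring the patched Modf functor (the function name splitFraction is invented for this example):

    #include <cmath>
    #include <cstdio>

    inline double splitFraction( double value, double* intpart )
    {
    #if( CUPLA_DEVICE_COMPILE == 1 ) // device pass (CUDA or HIP): global-namespace math
        return ::modf( value, intpart );
    #else                            // host pass: the C++ standard library
        return std::modf( value, intpart );
    #endif
    }

    int main()
    {
        double intpart = 0.0;
        double const frac = splitFraction( 3.25, &intpart );
        std::printf( "int = %g, frac = %g\n", intpart, frac ); // int = 3, frac = 0.25
    }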
diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp
index 1344b24310..5822897d67 100644
--- a/include/pmacc/random/methods/XorMin.hpp
+++ b/include/pmacc/random/methods/XorMin.hpp
@@ -40,14 +40,14 @@ namespace random
 namespace methods
 {
 
-#if( BOOST_LANG_CUDA || BOOST_LANG_HIP )
+#if( ALPAKA_ACC_GPU_CUDA_ENABLED || ALPAKA_ACC_GPU_HIP_ENABLED )
     //! Uses the CUDA XORWOW RNG but does not store state members required for normal distribution
     template< typename T_Acc = cupla::Acc>
     class XorMin
     {
-#if (BOOST_LANG_HIP)
+#if( BOOST_LANG_HIP )
         using NativeStateType = hiprandStateXORWOW_t;
-#elif (BOOST_LANG_CUDA)
+#elif( BOOST_LANG_CUDA )
         using NativeStateType = curandStateXORWOW_t;
 #endif
 
@@ -69,13 +69,13 @@ namespace methods
 
             DINLINE StateType( NativeStateType const & other ): d( other.d )
             {
-#if (BOOST_LANG_HIP)
+#if( BOOST_LANG_HIP )
                 auto const* nativeStateArray = other.x;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.x ),
                     Unexpected_sizes
                 );
-#elif (BOOST_LANG_CUDA)
+#elif( BOOST_LANG_CUDA )
                 auto const* nativeStateArray = other.v;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.v ),
@@ -97,10 +97,10 @@ namespace methods
         {
             NativeStateType tmpState;
 
-#if (BOOST_LANG_HIP)
-# define define PMACC_RNG_INIT_FN hiprand_init
-#elif (BOOST_LANG_CUDA)
-# define define PMACC_RNG_INIT_FN curand_init
+#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
+# define PMACC_RNG_INIT_FN hiprand_init
+#elif( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 )
+# define PMACC_RNG_INIT_FN curand_init
 #endif
 
             PMACC_RNG_INIT_FN(
diff --git a/include/pmacc/static_assert.hpp b/include/pmacc/static_assert.hpp
index 7f279572e0..1f730bbf36 100644
--- a/include/pmacc/static_assert.hpp
+++ b/include/pmacc/static_assert.hpp
@@ -45,7 +45,7 @@ namespace pmacc
  * @param pmacc_unique_id pre compiler unique id
  * @param pmacc_typeInfo a type that is shown in error message
  */
-#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA
+#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
 /* device compile with clang: boost static assert can not be used
  * error is: calling a `__host__` function from `__device__`
  * Therefore C++11 `static_assert` is used
diff --git a/include/pmacc/types.hpp b/include/pmacc/types.hpp
index 826219474d..7230641f36 100644
--- a/include/pmacc/types.hpp
+++ b/include/pmacc/types.hpp
@@ -33,7 +33,7 @@
 # define PMACC_CUDA_ENABLED ALPAKA_ACC_GPU_CUDA_ENABLED
 #endif
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
 /* include mallocMC before cupla renaming is activated, else we need the variable acc
  * to call atomic cuda functions
 */
From 315bd9a4286beb370ba287dac47dd25b9b0ec5a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?=
Date: Thu, 17 Sep 2020 11:28:22 +0200
Subject: [PATCH 13/13] radiation plugin: add new option

Add the runtime option `numJobs` to increase the amount of independent work
per device. If set to one, the code behaves as before this PR. The default
is two, to better utilize modern GPU devices in a typical configuration.
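To make the intent of `numJobs` concrete before the diff: each job owns one column of a (number of amplitudes) x numJobs result buffer and strides over the work items, so jobs never write to the same element; afterwards all columns are summed into column zero, mirroring the host-side reduction added to Radiation.hpp below. A standalone host-only sketch under these assumptions (fake unit contributions stand in for the real radiation amplitudes):

    #include <cstdio>
    #include <vector>

    int main()
    {
        int const numJobs = 2;        // the new runtime option
        int const numAmplitudes = 4;  // N_theta * N_omega in the plugin
        int const numWorkItems = 10;  // stands in for the number of supercells

        // layout: result[ amplitude + numAmplitudes * job ]
        std::vector< double > result( numAmplitudes * numJobs, 0.0 );

        for( int job = 0; job < numJobs; ++job )                          // independent jobs
            for( int item = job; item < numWorkItems; item += numJobs )   // job-strided loop
                for( int a = 0; a < numAmplitudes; ++a )
                    result[ a + numAmplitudes * job ] += 1.0;             // fake contribution

        // reduce all temporary columns into column zero
        for( int job = 1; job < numJobs; ++job )
            for( int a = 0; a < numAmplitudes; ++a )
                result[ a ] += result[ a + numAmplitudes * job ];

        std::printf( "first amplitude after reduction: %g\n", result[ 0 ] ); // prints 10
    }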
--- docs/TBG_macros.cfg | 3 +- docs/source/usage/plugins/radiation.rst | 8 ++++- .../picongpu/plugins/radiation/Radiation.hpp | 33 +++++++++++++++---- .../plugins/radiation/Radiation.kernel | 8 +++-- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/docs/TBG_macros.cfg b/docs/TBG_macros.cfg index e11947e906..c09d4ff815 100644 --- a/docs/TBG_macros.cfg +++ b/docs/TBG_macros.cfg @@ -124,7 +124,8 @@ TBG_stopWindow="--stopWindow 1337" #--_radiation.end Time step to stop calculating the radiation #--_radiation.radPerGPU If flag is set, each GPU stores its own spectra without summing the entire simulation area #--_radiation.folderRadPerGPU Folder where the GPU specific spectras are stored -#--e__radiation.compression If flag is set, the hdf5 output will be compressed. +#--_radiation.compression If flag is set, the hdf5 output will be compressed. +#--_radiation.numJobs Number of independent jobs used for the radiation calculation. TBG_radiation="--_radiation.period 1 --_radiation.dump 2 --_radiation.totalRadiation \ --_radiation.lastRadiation --_radiation.start 2800 --_radiation.end 3000" diff --git a/docs/source/usage/plugins/radiation.rst b/docs/source/usage/plugins/radiation.rst index 002035a3b4..a47cc8580e 100644 --- a/docs/source/usage/plugins/radiation.rst +++ b/docs/source/usage/plugins/radiation.rst @@ -287,6 +287,11 @@ Command line option Description ``--_radiation.folderRadPerGPU`` Name of the folder, where the GPU specific spectra are stored. Default: ``radPerGPU`` ``--_radiation.compression`` If set, the hdf5 output is compressed. +``--_radiation.numJobs`` Number of independent jobs used for the radiation calculation. + This option is used to increase the utilization of the device by producing more independent work. + This option enables accumulation of data in parallel into multiple temporary arrays, thereby increasing the utilization of + the device by increasing the memory footprint + Default: ``2`` ========================================= ============================================================================================================================== Memory Complexity @@ -295,7 +300,8 @@ Memory Complexity Accelerator """"""""""" -each energy bin times each coordinate bin allocates one counter (``float_X``) permanently and on each accelerator. +locally, ``numJobs`` times number of frequencies ``N_omega`` times number of directions ``N_theta`` is permanently allocated. +Each result element (amplitude) is a double precision complex number. Host """" diff --git a/include/picongpu/plugins/radiation/Radiation.hpp b/include/picongpu/plugins/radiation/Radiation.hpp index d07700bc79..f05912cd4e 100644 --- a/include/picongpu/plugins/radiation/Radiation.hpp +++ b/include/picongpu/plugins/radiation/Radiation.hpp @@ -77,7 +77,6 @@ namespace idLabels }// end namespace idLabels - /////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////// Radiation Plugin Class //////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////// @@ -97,8 +96,10 @@ class Radiation : public ISimulationPlugin * frequency. Layout of the radiation array is: * [omega_1(theta_1),omega_2(theta_1),...,omega_N-omega(theta_1), * omega_1(theta_2),omega_2(theta_2),...,omega_N-omega(theta_N-theta)] + * The second dimension is used to store intermediate results if command + * line option numJobs is > 1. 
      */
-    GridBuffer *radiation;
+    GridBuffer *radiation;
 
     radiation_frequencies::InitFreqFunctor freqInit;
     radiation_frequencies::FreqFunctor freqFkt;
@@ -119,6 +120,7 @@ class Radiation : public ISimulationPlugin
     bool radPerGPU;
     std::string folderRadPerGPU;
     DataSpace lastGPUpos;
+    int numJobs;
 
     /**
      * Data structure for storage and summation of the intermediate values of
@@ -214,7 +216,8 @@ class Radiation : public ISimulationPlugin
             ((pluginPrefix + ".end").c_str(), po::value (&radEnd)->default_value(0), "time index when radiation should end with calculation")
             ((pluginPrefix + ".radPerGPU").c_str(), po::bool_switch(&radPerGPU), "enable radiation output from each GPU individually")
             ((pluginPrefix + ".folderRadPerGPU").c_str(), po::value (&folderRadPerGPU)->default_value("radPerGPU"), "folder in which the radiation of each GPU is written")
-            ((pluginPrefix + ".compression").c_str(), po::bool_switch(&compressionOn), "enable compression of hdf5 output");
+            ((pluginPrefix + ".compression").c_str(), po::bool_switch(&compressionOn), "enable compression of hdf5 output")
+            ((pluginPrefix + ".numJobs").c_str(), po::value (&numJobs)->default_value(2), "Number of independent jobs used for the radiation calculation.");
 
@@ -282,13 +285,22 @@ class Radiation : public ISimulationPlugin
     {
         if(!notifyPeriod.empty())
         {
+            if(numJobs <= 0)
+            {
+                std::cerr << "'numJobs' must be >= 1, value is adjusted from " << numJobs << " to 1." << std::endl;
+                numJobs = 1;
+            }
             // allocate memory for all amplitudes for temporal data collection
             tmp_result = new Amplitude[elements_amplitude()];
 
             /*only rank 0 create a file*/
             isMaster = reduce.hasResult(mpi::reduceMethods::Reduce());
-            radiation = new GridBuffer (DataSpace (elements_amplitude())); //create one int on GPU and host
+            /* Buffer for GPU results.
+             * The second dimension is used to store intermediate results if command
+             * line option numJobs is > 1.
+ */ + radiation = new GridBuffer (DataSpace<2>(elements_amplitude(), numJobs)); freqInit.Init(frequencies_from_list::listLocation); freqFkt = freqInit.getFunctor(); @@ -387,6 +399,15 @@ class Radiation : public ISimulationPlugin { radiation->deviceToHost(); __getTransactionEvent().waitForFinished(); + + auto dbox = radiation->getHostBuffer().getDataBox(); + int numAmp = elements_amplitude(); + // update the main result matrix (y index zero) + for( int resultIdx = 1; resultIdx < numJobs; ++resultIdx ) + for( int ampIdx = 0; ampIdx < numAmp; ++ampIdx ) + { + dbox(DataSpace< 2 >( ampIdx, 0 ) ) += dbox(DataSpace< 2 >( ampIdx, resultIdx ) ); + } } @@ -1188,8 +1209,8 @@ class Radiation : public ISimulationPlugin PMACC_KERNEL( KernelRadiationParticles< numWorkers >{} )( - gridDim_rad, - numWorkers + DataSpace< 2 >(gridDim_rad, numJobs), + DataSpace< 2 >(numWorkers,1) )( /*Pointer to particles memory on the device*/ particles->getDeviceParticlesBox(), diff --git a/include/picongpu/plugins/radiation/Radiation.kernel b/include/picongpu/plugins/radiation/Radiation.kernel index c81b030f21..13421f29cd 100644 --- a/include/picongpu/plugins/radiation/Radiation.kernel +++ b/include/picongpu/plugins/radiation/Radiation.kernel @@ -183,11 +183,13 @@ namespace radiation // get absolute number of relevant super cells int const numSuperCells = superCellsCount.productOfComponents(); + int const numJobs = cupla::gridDim(acc).y; + int const jobIdx = cupla::blockIdx(acc).y; - /* go over all super cells on GPU + /* go over all super cells on GPU with a stride depending on number of temporary results * but ignore all guarding supercells */ - for( int super_cell_index = 0; super_cell_index <= numSuperCells; ++super_cell_index ) + for( int super_cell_index = jobIdx; super_cell_index <= numSuperCells; super_cell_index += numJobs ) { // select SuperCell and add one sided guard again DataSpace< simDim > const superCell = @@ -481,7 +483,7 @@ namespace radiation * - from this (one) time step * - omega_id = theta_idx * radiation_frequencies::N_omega + o */ - radiation[ theta_idx * radiation_frequencies::N_omega + o] += amplitude; + radiation( DataSpace< 2 >(theta_idx * radiation_frequencies::N_omega + o, jobIdx ) ) += amplitude; } // end frequency loop
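For completeness, a hypothetical CUDA-only sketch (not the plugin kernel; the names and the work decomposition are simplified) of the launch-geometry change shown above: the y grid dimension carries the job index and each job writes its own column of the result buffer, which is why the job-strided loop needs no synchronization between jobs.

    #include <cstdio>

    __global__ void accumulatePartial( float* result, int numAmplitudes, int numItems )
    {
        int const job = blockIdx.y;       // plays the role of jobIdx in Radiation.kernel
        int const numJobs = gridDim.y;
        int const amplitude = blockIdx.x * blockDim.x + threadIdx.x;
        if( amplitude >= numAmplitudes )
            return;

        float sum = 0.0f;
        for( int item = job; item < numItems; item += numJobs ) // job-strided work loop
            sum += 1.0f;                                         // stand-in contribution

        result[ amplitude + numAmplitudes * job ] = sum;         // one column per job
    }

    int main()
    {
        int const numAmplitudes = 256;
        int const numJobs = 2;
        int const numItems = 1000;
        float* deviceResult = nullptr;
        cudaMalloc( (void**)&deviceResult, sizeof( float ) * numAmplitudes * numJobs );
        dim3 const blocks( ( numAmplitudes + 127 ) / 128, numJobs ); // y dimension = jobs
        accumulatePartial<<< blocks, 128 >>>( deviceResult, numAmplitudes, numItems );
        cudaDeviceSynchronize();
        cudaFree( deviceResult );
        std::puts( "done" );
    }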