From 65aeea5f8a64dadf907e00bd5a6e5a7e2494e53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Tue, 8 Sep 2020 09:33:08 +0200 Subject: [PATCH 01/13] fix usage of `::abs()` Use cupla math implementation of `abs()` instead of `abs` from the global scope. This change solves compile issues with HIP, found in my HIP prototype branch. --- include/picongpu/plugins/output/images/Visualisation.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/picongpu/plugins/output/images/Visualisation.hpp b/include/picongpu/plugins/output/images/Visualisation.hpp index 194233170c..969fd80832 100644 --- a/include/picongpu/plugins/output/images/Visualisation.hpp +++ b/include/picongpu/plugins/output/images/Visualisation.hpp @@ -156,7 +156,7 @@ struct typicalFields < 5 > const float_X tyEField = fields::laserProfiles::Selected::Unitless::W0 * BASE_DENSITY / 3.0f / EPS0; const float_X tyBField = tyEField * MUE0_EPS0; const float_X tyCurrent = particles::TYPICAL_PARTICLES_PER_CELL * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE - * abs(baseCharge) / DELTA_T; + * math::abs(baseCharge) / DELTA_T; return float3_X(tyBField, tyEField, tyCurrent); #endif From 1d3b24d05a37786502ec1d4dbb868769c1830657 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Mon, 14 Sep 2020 10:35:22 +0200 Subject: [PATCH 02/13] Clarify output concerning cuda_memtest not being available solves #3327 --- etc/picongpu/aris-grnet/gpu.tpl | 2 +- etc/picongpu/bash/mpiexec.tpl | 2 +- etc/picongpu/bash/mpirun.tpl | 2 +- etc/picongpu/davide-cineca/gpu.tpl | 2 +- etc/picongpu/davinci-rice/picongpu.tpl | 2 +- etc/picongpu/hemera-hzdr/fwkt_v100.tpl | 2 +- etc/picongpu/hemera-hzdr/gpu.tpl | 2 +- etc/picongpu/hemera-hzdr/k20.tpl | 2 +- etc/picongpu/hemera-hzdr/k20_restart.tpl | 2 +- etc/picongpu/hemera-hzdr/k80.tpl | 2 +- etc/picongpu/hemera-hzdr/k80_restart.tpl | 2 +- etc/picongpu/jureca-jsc/gpus.tpl | 2 +- etc/picongpu/juwels-jsc/gpus.tpl | 2 +- etc/picongpu/lawrencium-lbnl/fermi.tpl | 2 +- etc/picongpu/lawrencium-lbnl/k20.tpl | 2 +- etc/picongpu/pizdaint-cscs/large.tpl | 2 +- etc/picongpu/pizdaint-cscs/normal.tpl | 2 +- etc/picongpu/taurus-tud/V100.tpl | 2 +- etc/picongpu/taurus-tud/V100_restart.tpl | 2 +- etc/picongpu/taurus-tud/k20x.tpl | 2 +- etc/picongpu/taurus-tud/k80.tpl | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) diff --git a/etc/picongpu/aris-grnet/gpu.tpl b/etc/picongpu/aris-grnet/gpu.tpl index 440a2080b8..05ec169f0b 100644 --- a/etc/picongpu/aris-grnet/gpu.tpl +++ b/etc/picongpu/aris-grnet/gpu.tpl @@ -102,7 +102,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq # Run CUDA memtest to check GPU's health srun -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? 
-eq 0 ] ; then
diff --git a/etc/picongpu/bash/mpiexec.tpl b/etc/picongpu/bash/mpiexec.tpl
index 04f509cb57..c4f58c67d8 100644
--- a/etc/picongpu/bash/mpiexec.tpl
+++ b/etc/picongpu/bash/mpiexec.tpl
@@ -53,7 +53,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpiexec -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/bash/mpirun.tpl b/etc/picongpu/bash/mpirun.tpl
index fb6e760cd1..f9c07cdc62 100644
--- a/etc/picongpu/bash/mpirun.tpl
+++ b/etc/picongpu/bash/mpirun.tpl
@@ -53,7 +53,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpirun -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/davide-cineca/gpu.tpl b/etc/picongpu/davide-cineca/gpu.tpl
index f4c0f50ee6..0ccc52f254 100644
--- a/etc/picongpu/davide-cineca/gpu.tpl
+++ b/etc/picongpu/davide-cineca/gpu.tpl
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   srun --cpu-bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/davinci-rice/picongpu.tpl b/etc/picongpu/davinci-rice/picongpu.tpl
index b4e316b2bd..765687fdab 100644
--- a/etc/picongpu/davinci-rice/picongpu.tpl
+++ b/etc/picongpu/davinci-rice/picongpu.tpl
@@ -81,7 +81,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpirun -n TBG_tasks --display-map -am tbg/openib.conf --mca mpi_leave_pinned 0 !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/hemera-hzdr/fwkt_v100.tpl b/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
index 5bc8341d9b..09058913a0 100644
--- a/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
+++ b/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi

 if [ $?
-eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/gpu.tpl b/etc/picongpu/hemera-hzdr/gpu.tpl index 90c9b2d12c..a06c278306 100644 --- a/etc/picongpu/hemera-hzdr/gpu.tpl +++ b/etc/picongpu/hemera-hzdr/gpu.tpl @@ -101,7 +101,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq # Run CUDA memtest to check GPU's health mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/k20.tpl b/etc/picongpu/hemera-hzdr/k20.tpl index b8992555f0..f58454e6f9 100644 --- a/etc/picongpu/hemera-hzdr/k20.tpl +++ b/etc/picongpu/hemera-hzdr/k20.tpl @@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq # Run CUDA memtest to check GPU's health mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/k20_restart.tpl b/etc/picongpu/hemera-hzdr/k20_restart.tpl index 52b9701b07..d531615ccb 100644 --- a/etc/picongpu/hemera-hzdr/k20_restart.tpl +++ b/etc/picongpu/hemera-hzdr/k20_restart.tpl @@ -167,7 +167,7 @@ export OMPI_MCA_io=^ompio if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq !TBG_gpusPerNode ] ; then mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/k80.tpl b/etc/picongpu/hemera-hzdr/k80.tpl index 3cfc81ea4d..9c52c96b38 100644 --- a/etc/picongpu/hemera-hzdr/k80.tpl +++ b/etc/picongpu/hemera-hzdr/k80.tpl @@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq # Run CUDA memtest to check GPU's health mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? 
-eq 0 ] ; then diff --git a/etc/picongpu/hemera-hzdr/k80_restart.tpl b/etc/picongpu/hemera-hzdr/k80_restart.tpl index d65f9e9730..4a74804505 100644 --- a/etc/picongpu/hemera-hzdr/k80_restart.tpl +++ b/etc/picongpu/hemera-hzdr/k80_restart.tpl @@ -167,7 +167,7 @@ export OMPI_MCA_io=^ompio if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq !TBG_gpusPerNode ] ; then mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/jureca-jsc/gpus.tpl b/etc/picongpu/jureca-jsc/gpus.tpl index a6bda9a5e2..182bf90225 100644 --- a/etc/picongpu/jureca-jsc/gpus.tpl +++ b/etc/picongpu/jureca-jsc/gpus.tpl @@ -93,7 +93,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedDevicesPerNode # Run CUDA memtest to check GPU's health srun --cpu_bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/juwels-jsc/gpus.tpl b/etc/picongpu/juwels-jsc/gpus.tpl index 6d8c717931..c90d613289 100644 --- a/etc/picongpu/juwels-jsc/gpus.tpl +++ b/etc/picongpu/juwels-jsc/gpus.tpl @@ -93,7 +93,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedDevicesPerNode # Run CUDA memtest to check GPU's health srun --cpu_bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/lawrencium-lbnl/fermi.tpl b/etc/picongpu/lawrencium-lbnl/fermi.tpl index 796562c1b2..16bb7e525c 100644 --- a/etc/picongpu/lawrencium-lbnl/fermi.tpl +++ b/etc/picongpu/lawrencium-lbnl/fermi.tpl @@ -106,7 +106,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health mpirun !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/lawrencium-lbnl/k20.tpl b/etc/picongpu/lawrencium-lbnl/k20.tpl index 76518fbd1a..a7be12dd2a 100644 --- a/etc/picongpu/lawrencium-lbnl/k20.tpl +++ b/etc/picongpu/lawrencium-lbnl/k20.tpl @@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health mpirun !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? 
-eq 0 ] ; then diff --git a/etc/picongpu/pizdaint-cscs/large.tpl b/etc/picongpu/pizdaint-cscs/large.tpl index 41c87ffa7b..6e6501384d 100644 --- a/etc/picongpu/pizdaint-cscs/large.tpl +++ b/etc/picongpu/pizdaint-cscs/large.tpl @@ -82,7 +82,7 @@ ln -s ../stdout output if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then srun -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/pizdaint-cscs/normal.tpl b/etc/picongpu/pizdaint-cscs/normal.tpl index 5de12231bf..abcbf7f607 100644 --- a/etc/picongpu/pizdaint-cscs/normal.tpl +++ b/etc/picongpu/pizdaint-cscs/normal.tpl @@ -87,7 +87,7 @@ export PMI_NO_PREINITIALIZE=1 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then srun -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/taurus-tud/V100.tpl b/etc/picongpu/taurus-tud/V100.tpl index 8f6dbbf922..f678e75c8d 100644 --- a/etc/picongpu/taurus-tud/V100.tpl +++ b/etc/picongpu/taurus-tud/V100.tpl @@ -107,7 +107,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/taurus-tud/V100_restart.tpl b/etc/picongpu/taurus-tud/V100_restart.tpl index 8e34ff0d1b..d1f07c337f 100644 --- a/etc/picongpu/taurus-tud/V100_restart.tpl +++ b/etc/picongpu/taurus-tud/V100_restart.tpl @@ -175,7 +175,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health mpiexec -hostfile ../machinefile.txt !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then diff --git a/etc/picongpu/taurus-tud/k20x.tpl b/etc/picongpu/taurus-tud/k20x.tpl index d7be22efe9..17b34581f1 100644 --- a/etc/picongpu/taurus-tud/k20x.tpl +++ b/etc/picongpu/taurus-tud/k20x.tpl @@ -97,7 +97,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? 
-eq 0 ] ; then diff --git a/etc/picongpu/taurus-tud/k80.tpl b/etc/picongpu/taurus-tud/k80.tpl index 279bdc9e43..80480f5a6c 100644 --- a/etc/picongpu/taurus-tud/k80.tpl +++ b/etc/picongpu/taurus-tud/k80.tpl @@ -97,7 +97,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then # Run CUDA memtest to check GPU's health srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh else - echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2 + echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2 fi if [ $? -eq 0 ] ; then From 700c248b4528c469236fd94e96525b5e60bf76f7 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Wed, 16 Sep 2020 15:14:39 +0200 Subject: [PATCH 03/13] Remove leftover mentions of pmacc::nvidia::rng with no actual use Fix outdated comments --- include/picongpu/particles/Particles.kernel | 4 ---- include/picongpu/particles/filter/generic/FreeRng.hpp | 2 -- include/picongpu/particles/functor/misc/Rng.hpp | 4 ---- include/picongpu/particles/manipulators/generic/FreeRng.def | 2 +- include/picongpu/particles/manipulators/generic/FreeRng.hpp | 2 -- .../picongpu/particles/manipulators/unary/RandomPosition.def | 2 +- include/picongpu/particles/manipulators/unary/Temperature.hpp | 2 +- include/picongpu/particles/startPosition/RandomImpl.hpp | 4 +--- include/picongpu/particles/startPosition/generic/FreeRng.hpp | 2 -- .../examples/FoilLCT/include/picongpu/param/particle.param | 1 - 10 files changed, 4 insertions(+), 21 deletions(-) diff --git a/include/picongpu/particles/Particles.kernel b/include/picongpu/particles/Particles.kernel index 93b0dcd09e..538189946f 100644 --- a/include/picongpu/particles/Particles.kernel +++ b/include/picongpu/particles/Particles.kernel @@ -35,10 +35,6 @@ #include #include -#include -#include -#include - #include #include #include diff --git a/include/picongpu/particles/filter/generic/FreeRng.hpp b/include/picongpu/particles/filter/generic/FreeRng.hpp index 2667644f29..dc701ebd5b 100644 --- a/include/picongpu/particles/filter/generic/FreeRng.hpp +++ b/include/picongpu/particles/filter/generic/FreeRng.hpp @@ -78,8 +78,6 @@ namespace acc T_Particle const & particle ) { - namespace nvrng = nvidia::rng; - bool const isValid = particle.isHandleValid( ); return isValid && Functor::operator()( diff --git a/include/picongpu/particles/functor/misc/Rng.hpp b/include/picongpu/particles/functor/misc/Rng.hpp index c141fb1ddc..0b40b1cf5b 100644 --- a/include/picongpu/particles/functor/misc/Rng.hpp +++ b/include/picongpu/particles/functor/misc/Rng.hpp @@ -22,8 +22,6 @@ #include "picongpu/simulation_defines.hpp" #include "picongpu/particles/functor/misc/RngWrapper.hpp" -#include -#include #include #include #include @@ -93,8 +91,6 @@ namespace misc T_WorkerCfg const & workerCfg ) const { - namespace nvrng = nvidia::rng; - RngHandle tmp( rngHandle ); tmp.init( localSupercellOffset * SuperCellSize::toRT() + diff --git a/include/picongpu/particles/manipulators/generic/FreeRng.def b/include/picongpu/particles/manipulators/generic/FreeRng.def index e11c8c2b4e..b82907f831 100644 --- a/include/picongpu/particles/manipulators/generic/FreeRng.def +++ b/include/picongpu/particles/manipulators/generic/FreeRng.def @@ -42,7 +42,7 @@ namespace generic * * example for `particle.param`: add * @code{.cpp} - * #include + * #include * * struct FunctorRandomX * { diff --git a/include/picongpu/particles/manipulators/generic/FreeRng.hpp b/include/picongpu/particles/manipulators/generic/FreeRng.hpp index 
adf5954b43..6bca53a296 100644 --- a/include/picongpu/particles/manipulators/generic/FreeRng.hpp +++ b/include/picongpu/particles/manipulators/generic/FreeRng.hpp @@ -81,8 +81,6 @@ namespace acc T_Args && ... args ) { - namespace nvrng = nvidia::rng; - Functor::operator()( m_rng, particle, diff --git a/include/picongpu/particles/manipulators/unary/RandomPosition.def b/include/picongpu/particles/manipulators/unary/RandomPosition.def index 91868f5047..0186f419ba 100644 --- a/include/picongpu/particles/manipulators/unary/RandomPosition.def +++ b/include/picongpu/particles/manipulators/unary/RandomPosition.def @@ -46,7 +46,7 @@ namespace acc { /** set in-cell position * - * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator + * @tparam T_Rng functor::misc::RngWrapper, type of the random number generator * @tparam T_Particle pmacc::Particle, particle type * @tparam T_Args pmacc::Particle, arbitrary number of particles types * diff --git a/include/picongpu/particles/manipulators/unary/Temperature.hpp b/include/picongpu/particles/manipulators/unary/Temperature.hpp index 6d2c0a8fe4..4eff7cb3f1 100644 --- a/include/picongpu/particles/manipulators/unary/Temperature.hpp +++ b/include/picongpu/particles/manipulators/unary/Temperature.hpp @@ -48,7 +48,7 @@ namespace acc { /** manipulate the speed of the particle * - * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator + * @tparam T_Rng functor::misc::RngWrapper, type of the random number generator * @tparam T_Particle pmacc::Particle, particle type * @tparam T_Args pmacc::Particle, arbitrary number of particles types * diff --git a/include/picongpu/particles/startPosition/RandomImpl.hpp b/include/picongpu/particles/startPosition/RandomImpl.hpp index 4727d54aee..75d409b907 100644 --- a/include/picongpu/particles/startPosition/RandomImpl.hpp +++ b/include/picongpu/particles/startPosition/RandomImpl.hpp @@ -24,8 +24,6 @@ #include "picongpu/particles/startPosition/generic/FreeRng.def" #include "picongpu/particles/startPosition/detail/WeightMacroParticles.hpp" -#include - #include @@ -43,7 +41,7 @@ namespace acc { /** set in-cell position and weighting * - * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator + * @tparam T_Rng functor::misc::RngWrapper, type of the random number generator * @tparam T_Particle pmacc::Particle, particle type * @tparam T_Args pmacc::Particle, arbitrary number of particles types * diff --git a/include/picongpu/particles/startPosition/generic/FreeRng.hpp b/include/picongpu/particles/startPosition/generic/FreeRng.hpp index c7a37b952d..cdc358e55c 100644 --- a/include/picongpu/particles/startPosition/generic/FreeRng.hpp +++ b/include/picongpu/particles/startPosition/generic/FreeRng.hpp @@ -80,8 +80,6 @@ namespace acc T_Args && ... 
args ) { - namespace nvrng = nvidia::rng; - Functor::operator()( m_rng, particle, diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param index 8ac1a6bc87..b57d4aad56 100644 --- a/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param +++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param @@ -34,7 +34,6 @@ #include #include -#include namespace picongpu From 5c59aba005621c7f2e00dc0f34583678c292bb18 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Wed, 16 Sep 2020 15:15:11 +0200 Subject: [PATCH 04/13] Change the game of life example to use pmacc::random tools instead of pmacc::nvidia::rng --- .../gameOfLife2D/include/Evolution.hpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp index 5e3affa0ea..2fd631723e 100644 --- a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp +++ b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp @@ -28,9 +28,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include #include #include @@ -199,11 +199,15 @@ namespace kernel blockCell + DataSpaceOperations< DIM2 >::template map< SuperCellSize >( workerIdx ) ); - // get uniform random number from seed - auto rng = nvidia::rng::create( - nvidia::rng::methods::Xor< T_Acc >( acc, seed, globalUniqueId ), - nvidia::rng::distributions::Uniform_float::get( acc ) - ); + // create a random number state and generator + using RngMethod = random::methods::XorMin< T_Acc >; + using State = typename RngMethod::StateType; + State state; + RngMethod method; + method.init( acc, state, seed, globalUniqueId ); + using Distribution = random::distributions::Uniform< float, RngMethod >; + using Random = random::Random< Distribution, RngMethod, State* >; + Random rng( &state ); ForEachIdx< IdxConfig< @@ -219,7 +223,7 @@ namespace kernel // cell index within the superCell DataSpace< DIM2 > const cellIdx = DataSpaceOperations< DIM2 >::template map< SuperCellSize >( linearIdx ); // write 1(white) if uniform random number 0( rng() <= threshold ); + buffWrite( blockCell + cellIdx ) = static_cast< bool >( rng( acc ) <= threshold ); } ); } From f2c32136db7028388f3a2aba34bd30b5dbec6394 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Wed, 16 Sep 2020 14:38:09 +0200 Subject: [PATCH 05/13] Remove pmacc/nvidia/rng/* as all its contents are never used These were outdated tools, with pmacc::random providing the modern counterparts --- include/pmacc/nvidia/rng/RNG.hpp | 94 ------------------- .../nvidia/rng/distributions/Normal_float.hpp | 80 ---------------- .../rng/distributions/Uniform_float.hpp | 93 ------------------ .../rng/distributions/Uniform_int32.hpp | 83 ---------------- include/pmacc/nvidia/rng/methods/Xor.hpp | 75 --------------- 5 files changed, 425 deletions(-) delete mode 100644 include/pmacc/nvidia/rng/RNG.hpp delete mode 100644 include/pmacc/nvidia/rng/distributions/Normal_float.hpp delete mode 100644 include/pmacc/nvidia/rng/distributions/Uniform_float.hpp delete mode 100644 include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp delete mode 100644 include/pmacc/nvidia/rng/methods/Xor.hpp diff --git a/include/pmacc/nvidia/rng/RNG.hpp b/include/pmacc/nvidia/rng/RNG.hpp deleted file mode 100644 index 2a47f857c9..0000000000 --- a/include/pmacc/nvidia/rng/RNG.hpp +++ /dev/null @@ -1,94 +0,0 @@ 
-/* Copyright 2013-2020 Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ - - /* create a random number generator on gpu - * \tparam RngMethod method to generate random number - * \tparam Distribution functor for distribution - */ - template - class RNG : public RNGMethod - { - public: - - typedef RNGMethod MethodType; - typedef Distribution DistributionType; - typedef RNG This; - - HDINLINE RNG() - { - } - - /* - * \param rngMethod instance of generator - * \param distribution instance of distribution functor - */ - DINLINE RNG(const RNGMethod& rng_method, const Distribution& rng_operation) : - RNGMethod(rng_method), op(rng_operation) - { - } - - HDINLINE RNG(const This& other) : - RNGMethod(static_cast(other)), op(other.op) - { - } - - /* default method to generate a random number - * @return random number - */ - DINLINE typename Distribution::Type operator()() - { - return this->op(this->getState()); - } - - private: - PMACC_ALIGN(op, Distribution); - }; - - /* create a random number generator on gpu - * \tparam RngMethod method to generate random number - * \tparam Distribution functor for distribution - * - * \param rngMethod instance of generator - * \param distribution instance of distribution functor - * \return class which can used to generate random numbers - */ - template - DINLINE typename pmacc::nvidia::rng::RNG create(const RngMethod & rngMethod, - const Distribution & distribution) - { - return pmacc::nvidia::rng::RNG(rngMethod, distribution); - } - -} // namespace rng -} // namespace nvidia -} // namespace pmacc diff --git a/include/pmacc/nvidia/rng/distributions/Normal_float.hpp b/include/pmacc/nvidia/rng/distributions/Normal_float.hpp deleted file mode 100644 index 2f09df70a7..0000000000 --- a/include/pmacc/nvidia/rng/distributions/Normal_float.hpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2013-2020 Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . 
- */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ -namespace distributions -{ -namespace detail -{ - /*Return normally distributed floats with mean 0.0f and standard deviation 1.0f - */ - template< typename T_Acc> - class Normal_float - { - public: - typedef float Type; - private: - using Dist = - decltype( - ::alpaka::rand::distribution::createNormalReal( - alpaka::core::declval())); - PMACC_ALIGN(dist, Dist); - public: - HDINLINE Normal_float() - { - } - - HDINLINE Normal_float(const T_Acc& acc) : dist(::alpaka::rand::distribution::createNormalReal(acc)) - { - } - - template - DINLINE Type operator()(RNGState& state) - { - return dist(state); - } - - }; -} // namespace detail - - struct Normal_float - { - template< typename T_Acc> - static HDINLINE detail::Normal_float< T_Acc > - get( T_Acc const & acc) - { - return detail::Normal_float< T_Acc >( acc ); - } - }; -} // namespace distributions -} // namespace rng -} // namespace nvidia -} // namespace pmacc diff --git a/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp b/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp deleted file mode 100644 index b757c9f04a..0000000000 --- a/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ -namespace distributions -{ -namespace detail -{ - /*create a random float number from [0.0,1.0) - */ - template< typename T_Acc> - class Uniform_float - { - public: - typedef float Type; - private: - using Dist = - decltype( - ::alpaka::rand::distribution::createUniformReal( - alpaka::core::declval())); - PMACC_ALIGN(dist, Dist); - public: - - HDINLINE Uniform_float() - { - } - - HDINLINE Uniform_float(const T_Acc& acc) : dist(::alpaka::rand::distribution::createUniformReal(acc)) - { - } - - template - DINLINE Type operator()(RNGState& state) - { - // (0.f, 1.0f] - const Type raw = dist(state); - - /// \warn hack, are is that really ok? I say, yes, since - /// it shifts just exactly one number. 
Axel - /// - /// Note: (1.0f - raw) does not work, since - /// nvidia seems to return denormalized - /// floats around 0.f (thats not as they - /// state it out in their documentation) - // [0.f, 1.0f) - const Type r = raw * static_cast( raw != Type(1.0) ); - return r; - } - - }; -} // namespace detail - - struct Uniform_float - { - template< typename T_Acc> - static HDINLINE detail::Uniform_float< T_Acc > - get( T_Acc const & acc) - { - return detail::Uniform_float< T_Acc >( acc ); - } - }; -} // namespace distributions -} // namespace rng -} // namespace nvidia -} // namespace pmacc diff --git a/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp b/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp deleted file mode 100644 index e0569ecb26..0000000000 --- a/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ -namespace distributions -{ -namespace detail -{ - /*create a 32Bit random int number - * Range: [INT_MIN,INT_MAX] - */ - template< typename T_Acc> - class Uniform_int32 - { - public: - typedef int32_t Type; - - private: - typedef uint32_t RngType; - using Dist = - decltype( - ::alpaka::rand::distribution::createUniformUint( - alpaka::core::declval())); - PMACC_ALIGN(dist, Dist); - public: - HDINLINE Uniform_int() - { - } - - HDINLINE Uniform_int(const T_Acc& acc) : dist(::alpaka::rand::distribution::createUniformUint(acc)) - { - } - - template - DINLINE Type operator()(RNGState& state) - { - /*curand create a random 32Bit int value*/ - return static_cast(dist(state)); - } - }; -} // namespace detail - - struct Normal_float - { - template< typename T_Acc> - static HDINLINE detail::Uniform_int32< T_Acc > - get( T_Acc const & acc) - { - return detail::Uniform_int32< T_Acc >( acc ); - } - }; -} // namespace distributions -} // namespace rng -} // namespace nvidia -} // namespace pmacc diff --git a/include/pmacc/nvidia/rng/methods/Xor.hpp b/include/pmacc/nvidia/rng/methods/Xor.hpp deleted file mode 100644 index f48e6c3714..0000000000 --- a/include/pmacc/nvidia/rng/methods/Xor.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright 2013-2020 Heiko Burau, Rene Widera - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - - -#pragma once - -#include "pmacc/types.hpp" - -namespace pmacc -{ -namespace nvidia -{ -namespace rng -{ -namespace methods -{ - - template< typename T_Acc > - class Xor - { - private: - using Gen = - decltype( - ::alpaka::rand::generator::createDefault( - alpaka::core::declval(), - alpaka::core::declval(), - alpaka::core::declval())); - PMACC_ALIGN(gen, Gen); - public: - typedef Gen StateType; - typedef T_Acc Acc; - - HDINLINE Xor() : gen (0) - { - } - - DINLINE Xor(const T_Acc& acc, uint32_t seed, uint32_t subsequence = 0) - { - gen = ::alpaka::rand::generator::createDefault(acc, seed, subsequence); - } - - HDINLINE Xor(const Xor& other): gen(other.gen) - { - - } - - protected: - - DINLINE StateType& getState() - { - return gen; - } - }; -} // namespace methods -} // namespace rng -} // namespace nvidia -} // namespace pmacc From 4e57dcac8e863daf9803fda24b8c2b738626a990 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Wed, 16 Sep 2020 16:06:58 +0200 Subject: [PATCH 06/13] Set default values for some PMacc game of life example arguments --- share/pmacc/examples/gameOfLife2D/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/share/pmacc/examples/gameOfLife2D/main.cpp b/share/pmacc/examples/gameOfLife2D/main.cpp index d33dc5abaa..fb9a4e4f10 100644 --- a/share/pmacc/examples/gameOfLife2D/main.cpp +++ b/share/pmacc/examples/gameOfLife2D/main.cpp @@ -41,7 +41,7 @@ int main( int argc, char **argv ) { typedef ::gol::Space Space; - std::vector devices; /* will be set by boost program argument option "-d 3 3 3" */ + std::vector devices; /* will be set by boost program argument option "-d 3 3" */ std::vector gridSize; /* same but with -g */ std::vector periodic; uint32_t steps; @@ -50,8 +50,8 @@ int main( int argc, char **argv ) po::options_description desc( "Allowed options" ); desc.add_options( ) ( "help,h", "produce help message" ) - ( "steps,s", po::value ( &steps ), "simulation steps" ) - ( "rule,r", po::value ( &rule ), "simulation rule etc. 23/3" ) + ( "steps,s", po::value ( &steps )->default_value( 100 ), "simulation steps" ) + ( "rule,r", po::value ( &rule )->default_value( "23/3" ), "simulation rule" ) ( "devices,d", po::value > ( &devices )->multitoken( ), "number of devices in each dimension (only 1D or 2D). 
If you use more than " "one device in total, you will need to run mpirun with \"mpirun -n " From 8ae41f0c2ecd0d4858d14ed6fc5db60220024549 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Mon, 14 Sep 2020 17:50:57 +0200 Subject: [PATCH 07/13] Add generic utilities to get absorber thickness These are to be used in the upcoming incident field implementation --- .../fields/MaxwellSolver/YeePML/YeePML.hpp | 3 +- include/picongpu/fields/absorber/Absorber.hpp | 119 ++++++++++++++++++ 2 files changed, 121 insertions(+), 1 deletion(-) diff --git a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp index 5cc787c575..8698a2279c 100644 --- a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp +++ b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp @@ -223,7 +223,8 @@ namespace maxwellSolver Thickness globalThickness; for( uint32_t axis = 0u; axis < simDim; axis++ ) for( auto direction = 0; direction < 2; direction++ ) - globalThickness( axis, direction ) = absorber::numCells[ axis ][ direction ]; + globalThickness( axis, direction ) = + absorber::getGlobalThickness()( axis, direction ); return globalThickness; } diff --git a/include/picongpu/fields/absorber/Absorber.hpp b/include/picongpu/fields/absorber/Absorber.hpp index 5357459117..75f1b60e42 100644 --- a/include/picongpu/fields/absorber/Absorber.hpp +++ b/include/picongpu/fields/absorber/Absorber.hpp @@ -166,6 +166,10 @@ namespace detail using Absorber = detail::Absorber< Solver >; /** Number of absorber cells along each boundary + * + * Stores the global absorber thickness in case the absorbing boundary + * conditions are used along each boundary. Note that in case of periodic + * boundaries the corresponding values will be ignored. * * Is uniform for both PML and exponential damping absorbers. * First index: 0 = x, 1 = y, 2 = z. @@ -178,6 +182,121 @@ namespace detail { Absorber::zNegativeNumCells, Absorber::zPositiveNumCells } }; + //! Thickness of the absorbing layer + class Thickness + { + public: + + //! Create a zero thickness + Thickness() + { + for( uint32_t axis = 0u; axis < 3u; axis++ ) + for( uint32_t direction = 0u; direction < 2u; direction++ ) + (*this)( axis, direction ) = 0u; + } + + /** Get thickness for the given boundary + * + * @param axis axis, 0 = x, 1 = y, 2 = z + * @param direction direction, 0 = negative (min coordinate), + * 1 = positive (max coordinate) + */ + uint32_t operator()( + uint32_t const axis, + uint32_t const direction + ) const + { + return numCells[ axis ][ direction ]; + } + + /** Get reference to thickness for the given boundary + * + * @param axis axis, 0 = x, 1 = y, 2 = z + * @param direction direction, 0 = negative (min coordinate), + * 1 = positive (max coordinate) + */ + uint32_t & operator()( + uint32_t const axis, + uint32_t const direction + ) + { + return numCells[ axis ][ direction ]; + } + + private: + + /** Number of absorber cells along each boundary + * + * First index: 0 = x, 1 = y, 2 = z. + * Second index: 0 = negative (min coordinate), 1 = positive (max coordinate). + */ + uint32_t numCells[ 3 ][ 2 ]; + + }; + + /** Get absorber thickness in number of cells for the global domain + * + * This function takes into account which boundaries are periodic and + * absorbing. 
+ */ + inline Thickness getGlobalThickness() + { + Thickness thickness; + for( uint32_t axis = 0u; axis < 3u; axis++ ) + for( uint32_t direction = 0u; direction < 2u; direction++ ) + thickness( axis, direction ) = numCells[ axis ][ direction ]; + const DataSpace< DIM3 > isPeriodicBoundary = + Environment::get().EnvironmentController().getCommunicator().getPeriodic(); + for( uint32_t axis = 0u; axis < 3u; axis++ ) + if( isPeriodicBoundary[ axis ] ) + { + thickness( axis, 0 ) = 0u; + thickness( axis, 1 ) = 0u; + } + return thickness; + } + + /** Get absorber thickness in number of cells for the current local domain + * + * This function takes into account the current domain decomposition and + * which boundaries are periodic and absorbing. + * + * Note that unlike getGlobalThickness() result which does not change + * throughout the simulation, the local thickness can change. Thus, + * the result of this function should not be reused on another time step, + * but rather the function called again. + */ + inline Thickness getLocalThickness() + { + Thickness thickness = getGlobalThickness(); + auto const numExchanges = NumberOfExchanges< simDim >::value; + auto const communicationMask = Environment< simDim >::get( ).GridController( ).getCommunicationMask( ); + for( uint32_t exchange = 1u; exchange < numExchanges; exchange++ ) + { + /* Here we are only interested in the positive and negative + * directions for x, y, z axes and not the "diagonal" ones. + * So skip other directions except left, right, top, bottom, + * back, front + */ + if( FRONT % exchange != 0 ) + continue; + + // Transform exchange into a pair of axis and direction + uint32_t axis = 0; + if( exchange >= BOTTOM && exchange <= TOP ) + axis = 1; + if( exchange >= BACK ) + axis = 2; + uint32_t direction = exchange % 2; + + // No absorber at the borders between two local domains + bool hasNeighbour = communicationMask.isSet( exchange ); + if( hasNeighbour ) + thickness( axis, direction ) = 0u; + } + return thickness; + } + namespace detail { From 8d0af9b9420f47175dfd98b2ff74b84925411095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Tue, 8 Sep 2020 09:21:48 +0200 Subject: [PATCH 08/13] PMacc warp and atomic function refactor - introduce support for HIP - change the way how `atomicAllInc` is implemented if no accelerator is given - use alpaka intrinsics and warp functions --- include/pmacc/nvidia/atomic.hpp | 317 ++++++++++++++++---------------- include/pmacc/nvidia/warp.hpp | 10 +- 2 files changed, 166 insertions(+), 161 deletions(-) diff --git a/include/pmacc/nvidia/atomic.hpp b/include/pmacc/nvidia/atomic.hpp index 1988da4d9a..13dba45f35 100644 --- a/include/pmacc/nvidia/atomic.hpp +++ b/include/pmacc/nvidia/atomic.hpp @@ -21,182 +21,179 @@ #pragma once - #include "pmacc/types.hpp" -#if( PMACC_CUDA_ENABLED == 1 ) -# include "pmacc/nvidia/warp.hpp" -#endif +#include "pmacc/memory/Array.hpp" +#include "pmacc/nvidia/warp.hpp" + +#include +#include + #include + +#include #include namespace pmacc { namespace nvidia +{ +namespace detail { - namespace detail { - - template - struct AtomicAllInc + template + struct AtomicAllInc + { + template< typename T_Acc, typename T_Hierarchy > + HDINLINE T_Type + operator()(const T_Acc& acc, T_Type* ptr, const T_Hierarchy& hierarchy) { - template< typename T_Acc, typename T_Hierarchy > - HDINLINE T_Type - operator()(const T_Acc& acc, T_Type* ptr, const T_Hierarchy& hierarchy) - { - return ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, ptr, T_Type(1), hierarchy); - } + 
return ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, ptr, T_Type(1), hierarchy); + } + }; + +#if CUPLA_DEVICE_COMPILE == 1 + /** + * Trait that returns whether an optimized version of AtomicAllInc + * exists for Kepler architectures (and up) + */ + template + struct AtomicAllIncIsOptimized + { + enum{ + value = boost::is_same::value || + boost::is_same::value || + boost::is_same::value || + boost::is_same::value || + boost::is_same::value }; - -#if PMACC_CUDA_ARCH >= 300 - /** - * Trait that returns whether an optimized version of AtomicAllInc - * exists for Kepler architectures (and up) - */ - template - struct AtomicAllIncIsOptimized + }; + + /** + * AtomicAllInc for Kepler and up + * Defaults to unoptimized version for unsupported types + */ + template::value> + struct AtomicAllIncKepler: public AtomicAllInc + {}; + + /** + * Optimized version + * + * This warp aggregated atomic increment implementation based on nvidia parallel forall example + * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ + * (author: Andrew Adinetz, date: October 1th, 2014) + * + */ + template + struct AtomicAllIncKepler + { + template< typename T_Acc, typename T_Hierarchy > + HDINLINE T_Type + operator()(const T_Acc& acc,T_Type* ptr, const T_Hierarchy& hierarchy) { - enum{ - value = boost::is_same::value || - boost::is_same::value || - boost::is_same::value || - boost::is_same::value || - boost::is_same::value - }; - }; - - /** - * AtomicAllInc for Kepler and up - * Defaults to unoptimized version for unsupported types - */ - template::value> - struct AtomicAllIncKepler: public AtomicAllInc - {}; - - /** - * Optimized version - * - * This warp aggregated atomic increment implementation based on nvidia parallel forall example - * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ - * (author: Andrew Adinetz, date: October 1th, 2014) + const auto mask = alpaka::warp::activemask(acc); + const auto leader = alpaka::intrinsic::ffs(acc, static_cast>(mask)) - 1; + + T_Type result; + const int laneId = getLaneId(); + /* Get the start value for this warp */ + if (laneId == leader) + result = ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc,ptr, static_cast(alpaka::intrinsic::popcount(acc, mask)), hierarchy); + result = warpBroadcast(result, leader); + /* Add offset per thread */ + return result + static_cast(alpaka::intrinsic::popcount(acc, mask & (( static_cast(1u) << laneId) - 1u))); + + } + }; + + /** + * Optimized version for int64. 
+ * As CUDA atomicAdd does not support int64 directly we just cast it + * and call the uint64 implementation + */ + template<> + struct AtomicAllIncKepler + { + template< typename T_Acc, typename T_Hierarchy > + HDINLINE long long int + operator()(const T_Acc& acc, long long int* ptr, const T_Hierarchy&, const T_Hierarchy& hierarchy ) + { + return static_cast( + AtomicAllIncKepler()( + acc, + reinterpret_cast(ptr), + hierarchy + ) + ); + } + }; + + template + struct AtomicAllInc: public AtomicAllIncKepler + {}; +#endif // CUPLA_DEVICE_COMPILE == 1 + +} // namespace detail + + /** optimized atomic increment + * + * - only optimized if PTX ISA >=3.0 + * - this atomic uses warp aggregation to speedup the operation compared to cuda `atomicInc()` + * - cuda `atomicAdd()` is used if the compute architecture does not support warp aggregation + * - all participate threads must change the same pointer (ptr) else the result is unspecified + * + * @param ptr pointer to memory (must be the same address for all threads in a block) + * + */ + template + HDINLINE + T atomicAllInc(const T_Acc& acc, T *ptr, const T_Hierarchy& hierarchy) + { + return detail::AtomicAllInc= 300 || BOOST_COMP_HIP) >()(acc, ptr, hierarchy); + } + + template + HDINLINE + T atomicAllInc(T *ptr) + { + /* Dirty hack to call an alpaka accelerator based function. + * Members of the fakeAcc will be uninitialized and must not be accessed. * + * The id provider for particles is the only code where atomicAllInc is used without an accelerator. + * @todo remove the unsafe faked accelerator */ - template - struct AtomicAllIncKepler - { - template< typename T_Acc, typename T_Hierarchy > - HDINLINE T_Type - operator()(const T_Acc& acc,T_Type* ptr, const T_Hierarchy& hierarchy) - { - /* Get a bitmask with 1 for each thread in the warp, that executes this */ -#if(__CUDACC_VER_MAJOR__ >= 9) - const int mask = __activemask(); -#else - const int mask = __ballot(1); + pmacc::memory::Array fakeAcc; + return atomicAllInc(fakeAcc[0], ptr, ::alpaka::hierarchy::Grids()); + } + + /** optimized atomic value exchange + * + * - only optimized if PTX ISA >=2.0 + * - this atomic uses warp vote function to speedup the operation + * compared to cuda `atomicExch()` + * - cuda `atomicExch()` is used if the compute architecture not supports + * warps vote functions + * - all participate threads must change the same + * pointer (ptr) and set the same value, else the + * result is unspecified + * + * @param ptr pointer to memory (must be the same address for all threads in a block) + * @param value new value (must be the same for all threads in a block) + */ + template + DINLINE void + atomicAllExch(const T_Acc& acc, T_Type* ptr, const T_Type value, const T_Hierarchy& hierarchy) + { + + const auto mask = alpaka::warp::activemask(acc); + const auto leader = alpaka::intrinsic::ffs(acc, static_cast>(mask)) - 1; + +#if CUPLA_DEVICE_COMPILE == 1 + if (getLaneId() == leader) #endif - /* select the leader */ - const int leader = __ffs(mask) - 1; - T_Type result; - const int laneId = getLaneId(); - /* Get the start value for this warp */ - if (laneId == leader) - result = ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc,ptr, static_cast(__popc(mask)), hierarchy); - result = warpBroadcast(result, leader); - /* Add offset per thread */ - return result + static_cast(__popc(mask & ((1 << laneId) - 1))); - } - }; - - /** - * Optimized version for int64. 
- * As CUDA atomicAdd does not support int64 directly we just cast it - * and call the uint64 implementation - */ - template<> - struct AtomicAllIncKepler - { - template< typename T_Acc, typename T_Hierarchy > - HDINLINE long long int - operator()(const T_Acc& acc, long long int* ptr, const T_Hierarchy&, const T_Hierarchy& hierarchy ) - { - return static_cast( - AtomicAllIncKepler()( - acc, - reinterpret_cast(ptr), - hierarchy - ) - ); - } - }; - - template - struct AtomicAllInc: public AtomicAllIncKepler - {}; -#endif /* PMACC_CUDA_ARCH >= 300 */ - - } // namespace detail - -/** optimized atomic increment - * - * - only optimized if PTX ISA >=3.0 - * - this atomic uses warp aggregation to speedup the operation compared to cuda `atomicInc()` - * - cuda `atomicAdd()` is used if the compute architecture does not support warp aggregation - * - all participate threads must change the same pointer (ptr) else the result is unspecified - * - * @param ptr pointer to memory (must be the same address for all threads in a block) - * - */ -template -HDINLINE -T atomicAllInc(const T_Acc& acc, T *ptr, const T_Hierarchy& hierarchy) -{ - return detail::AtomicAllInc= 300) >()(acc, ptr, hierarchy); -} - -template -HDINLINE -T atomicAllInc(T *ptr) -{ -#ifdef __CUDA_ARCH__ - return atomicAllInc(alpaka::atomic::AtomicUniformCudaHipBuiltIn(), ptr, ::alpaka::hierarchy::Grids()); -#else - // assume that we can use the standard library atomics if we are not on gpu - return atomicAllInc(alpaka::atomic::AtomicStdLibLock<16>(), ptr, ::alpaka::hierarchy::Grids()); -#endif -} - -/** optimized atomic value exchange - * - * - only optimized if PTX ISA >=2.0 - * - this atomic uses warp vote function to speedup the operation - * compared to cuda `atomicExch()` - * - cuda `atomicExch()` is used if the compute architecture not supports - * warps vote functions - * - all participate threads must change the same - * pointer (ptr) and set the same value, else the - * result is unspecified - * - * @param ptr pointer to memory (must be the same address for all threads in a block) - * @param value new value (must be the same for all threads in a block) - */ -template -DINLINE void -atomicAllExch(const T_Acc& acc, T_Type* ptr, const T_Type value, const T_Hierarchy& hierarchy) -{ -#if (__CUDA_ARCH__ >= 200) -# if(__CUDACC_VER_MAJOR__ >= 9) - const int mask = __activemask(); -# else - const int mask = __ballot(1); -# endif - // select the leader - const int leader = __ffs(mask) - 1; - // leader does the update - if (getLaneId() == leader) -#endif - ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, ptr, value, hierarchy); -} - + ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, ptr, value, hierarchy); + } } //namespace nvidia } //namespace pmacc diff --git a/include/pmacc/nvidia/warp.hpp b/include/pmacc/nvidia/warp.hpp index 48c58dbad6..da120de01b 100644 --- a/include/pmacc/nvidia/warp.hpp +++ b/include/pmacc/nvidia/warp.hpp @@ -21,6 +21,7 @@ #pragma once +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) #include "pmacc/types.hpp" @@ -42,10 +43,15 @@ DINLINE uint32_t getLaneId() asm("mov.u32 %0, %%laneid;" : "=r" (id)); return id; } +#elif BOOST_COMP_HIP +DINLINE uint32_t getLaneId() +{ + return __lane_id(); +} #endif -#if (__CUDA_ARCH__ >= 300) +#if (__CUDA_ARCH__ >= 300 || BOOST_COMP_HIP) /** broadcast data within a warp * * required PTX ISA >=3.0 @@ -114,3 +120,5 @@ DINLINE double warpBroadcast(double data, const int32_t srcLaneId) } //namespace nvidia } //namespace pmacc + +#endif From 
e47ea6121d0bc0a9ad320f6217f01aa7ce369d57 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Thu, 17 Sep 2020 09:42:25 +0200 Subject: [PATCH 09/13] Replace usage of pmacc::memory::makeUnique with C++14 std::make_unique --- include/picongpu/fields/EMFieldBase.tpp | 3 +-- include/picongpu/fields/FieldJ.tpp | 3 +-- include/picongpu/fields/FieldTmp.tpp | 5 ++--- include/picongpu/particles/ParticlesFunctors.hpp | 3 +-- include/picongpu/particles/flylite/NonLTE.tpp | 13 +++++-------- .../plugins/xrayScattering/XrayScattering.hpp | 7 +++---- .../plugins/xrayScattering/XrayScatteringWriter.hpp | 4 ++-- .../picongpu/simulation/control/MySimulation.hpp | 13 ++++++------- include/pmacc/memory/buffers/ExchangeIntern.hpp | 13 ++++++------- .../examples/gameOfLife2D/include/Evolution.hpp | 3 +-- 10 files changed, 28 insertions(+), 39 deletions(-) diff --git a/include/picongpu/fields/EMFieldBase.tpp b/include/picongpu/fields/EMFieldBase.tpp index 1371951691..9c3f56aa44 100644 --- a/include/picongpu/fields/EMFieldBase.tpp +++ b/include/picongpu/fields/EMFieldBase.tpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -58,7 +57,7 @@ namespace fields SimulationFieldHelper< MappingDesc >( cellDescription ), id( id ) { - buffer = pmacc::memory::makeUnique< Buffer >( + buffer = std::make_unique< Buffer >( cellDescription.getGridLayout( ) ); diff --git a/include/picongpu/fields/FieldJ.tpp b/include/picongpu/fields/FieldJ.tpp index 20272f1537..d3ec7f6453 100644 --- a/include/picongpu/fields/FieldJ.tpp +++ b/include/picongpu/fields/FieldJ.tpp @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -127,7 +126,7 @@ FieldJ::FieldJ( MappingDesc const & cellDescription ) : if( originRecvGuard != DataSpace::create(0) || endRecvGuard != DataSpace::create(0) ) { - fieldJrecv = pmacc::memory::makeUnique< GridBuffer >( + fieldJrecv = std::make_unique< GridBuffer >( buffer.getDeviceBuffer(), cellDescription.getGridLayout( ) ); diff --git a/include/picongpu/fields/FieldTmp.tpp b/include/picongpu/fields/FieldTmp.tpp index a6f3d718d6..12d28976c7 100644 --- a/include/picongpu/fields/FieldTmp.tpp +++ b/include/picongpu/fields/FieldTmp.tpp @@ -27,7 +27,6 @@ #include "picongpu/particles/traits/GetInterpolation.hpp" #include -#include #include #include #include @@ -66,10 +65,10 @@ namespace picongpu m_commTagGather = pmacc::traits::getNextId( ) + SPECIES_FIRSTTAG; using Buffer = GridBuffer< ValueType, simDim >; - fieldTmp = memory::makeUnique< Buffer >( cellDescription.getGridLayout( ) ); + fieldTmp = std::make_unique< Buffer >( cellDescription.getGridLayout( ) ); if( fieldTmpSupportGatherCommunication ) - fieldTmpRecv = memory::makeUnique< Buffer >( + fieldTmpRecv = std::make_unique< Buffer >( fieldTmp->getDeviceBuffer(), cellDescription.getGridLayout( ) ); diff --git a/include/picongpu/particles/ParticlesFunctors.hpp b/include/picongpu/particles/ParticlesFunctors.hpp index ae2e5140c0..103aa436f9 100644 --- a/include/picongpu/particles/ParticlesFunctors.hpp +++ b/include/picongpu/particles/ParticlesFunctors.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include "picongpu/particles/traits/GetIonizerList.hpp" #if( PMACC_CUDA_ENABLED == 1 ) @@ -99,7 +98,7 @@ struct CreateSpecies { DataConnector &dc = Environment<>::get().DataConnector(); dc.consume( - pmacc::memory::makeUnique( + std::make_unique( deviceHeap, *cellDesc, FrameType::getName() diff --git a/include/picongpu/particles/flylite/NonLTE.tpp b/include/picongpu/particles/flylite/NonLTE.tpp index 
aae44341bc..9c840cbb09 100644 --- a/include/picongpu/particles/flylite/NonLTE.tpp +++ b/include/picongpu/particles/flylite/NonLTE.tpp @@ -27,11 +27,9 @@ #include "picongpu/particles/particleToGrid/derivedAttributes/Density.def" #include "picongpu/particles/traits/GetShape.hpp" -/* pmacc */ #include #include #include -#include #include @@ -64,11 +62,10 @@ namespace flylite DataConnector &dc = Environment<>::get().DataConnector(); - using pmacc::memory::makeUnique; // once allocated for all ion species to share if( ! dc.hasId( helperFields::LocalEnergyHistogram::getName( "electrons" ) ) ) dc.consume( - makeUnique< helperFields::LocalEnergyHistogram >( + std::make_unique< helperFields::LocalEnergyHistogram >( "electrons", m_avgGridSizeLocal ) @@ -76,7 +73,7 @@ namespace flylite if( ! dc.hasId( helperFields::LocalEnergyHistogram::getName( "photons" ) ) ) dc.consume( - makeUnique< helperFields::LocalEnergyHistogram >( + std::make_unique< helperFields::LocalEnergyHistogram >( "photons", m_avgGridSizeLocal ) @@ -84,7 +81,7 @@ namespace flylite if( ! dc.hasId( helperFields::LocalDensity::getName( "electrons" ) ) ) dc.consume( - makeUnique< helperFields::LocalDensity >( + std::make_unique< helperFields::LocalDensity >( "electrons", m_avgGridSizeLocal ) @@ -93,7 +90,7 @@ namespace flylite // for each ion species if( ! dc.hasId( helperFields::LocalRateMatrix::getName( ionSpeciesName ) ) ) dc.consume( - makeUnique< helperFields::LocalRateMatrix >( + std::make_unique< helperFields::LocalRateMatrix >( ionSpeciesName, m_avgGridSizeLocal ) @@ -101,7 +98,7 @@ namespace flylite if( ! dc.hasId( helperFields::LocalDensity::getName( ionSpeciesName ) ) ) dc.consume( - makeUnique< helperFields::LocalDensity >( + std::make_unique< helperFields::LocalDensity >( ionSpeciesName, m_avgGridSizeLocal ) diff --git a/include/picongpu/plugins/xrayScattering/XrayScattering.hpp b/include/picongpu/plugins/xrayScattering/XrayScattering.hpp index 85c9d0c5b6..c52d32d6fb 100644 --- a/include/picongpu/plugins/xrayScattering/XrayScattering.hpp +++ b/include/picongpu/plugins/xrayScattering/XrayScattering.hpp @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -290,7 +289,7 @@ namespace xrayScattering * CoordinateTransform.hpp is still set to (0,0,0) when the * XrayScattering object is initialized. */ - probingBeam = pmacc::memory::makeUnique< + probingBeam = std::make_unique< beam::XrayScatteringBeam >( ); // Set the steps at which the xrayScattering amplitude is // calculated. @@ -337,7 +336,7 @@ namespace xrayScattering ); } // Allocate amplitude buffer. - amplitude = pmacc::memory::makeUnique< ComplexBuffer >( + amplitude = std::make_unique< ComplexBuffer >( DataSpace< DIM1 >( bufferSize ) ); // Initialize, on device, its fields with zero. amplitude->getDeviceBuffer( ).setValue( 0.0 ); @@ -376,7 +375,7 @@ namespace xrayScattering ).getGlobalDomain( ).size.productOfComponents( ); // Initialize an object responsible for output writing. 
- dataWriter = pmacc::memory::makeUnique< XrayScatteringWriter< + dataWriter = std::make_unique< XrayScatteringWriter< float_X > >( pluginPrefix + "Output", fileExtension, diff --git a/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp b/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp index bec1dbcab4..f48680e84b 100644 --- a/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp +++ b/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp @@ -213,7 +213,7 @@ namespace xrayScattering if ( outputMemoryLayout == OutputMemoryLayout::Distribute ) { // Open a series for a parallel write. - openPMDSeries = pmacc::memory::makeUnique< ::openPMD::Series >( + openPMDSeries = std::make_unique< ::openPMD::Series >( fullName, at, mpiCommunicator @@ -222,7 +222,7 @@ namespace xrayScattering else { // Open a series for a serial write. - openPMDSeries = pmacc::memory::makeUnique< ::openPMD::Series >( + openPMDSeries = std::make_unique< ::openPMD::Series >( fullName, at ); diff --git a/include/picongpu/simulation/control/MySimulation.hpp b/include/picongpu/simulation/control/MySimulation.hpp index 6e038578b9..cc12a59768 100644 --- a/include/picongpu/simulation/control/MySimulation.hpp +++ b/include/picongpu/simulation/control/MySimulation.hpp @@ -341,7 +341,7 @@ class MySimulation : public SimulationHelper ); using RNGFactory = pmacc::random::RNGProvider< simDim, random::Generator >; - auto rngFactory = pmacc::memory::makeUnique< RNGFactory >( + auto rngFactory = std::make_unique< RNGFactory >( Environment::get().SubGrid().getLocalDomain().size ); if (Environment::get().GridController().getGlobalRank() == 0) @@ -444,7 +444,7 @@ class MySimulation : public SimulationHelper ); cuplaStreamSynchronize( 0 ); - auto mallocMCBuffer = pmacc::memory::makeUnique< MallocMCBuffer >( deviceHeap ); + auto mallocMCBuffer = std::make_unique< MallocMCBuffer >( deviceHeap ); dc.consume( std::move( mallocMCBuffer ) ); #endif meta::ForEach< VectorAllSpecies, particles::LogMemoryStatisticsForSpecies > logMemoryStatisticsForSpecies; @@ -683,16 +683,15 @@ class MySimulation : public SimulationHelper void initFields( DataConnector& dataConnector ) { - using pmacc::memory::makeUnique; - auto fieldB = makeUnique< FieldB >( *cellDescription ); + auto fieldB = std::make_unique< FieldB >( *cellDescription ); dataConnector.consume( std::move( fieldB ) ); - auto fieldE = makeUnique< FieldE >( *cellDescription ); + auto fieldE = std::make_unique< FieldE >( *cellDescription ); dataConnector.consume( std::move( fieldE ) ); - auto fieldJ = makeUnique< FieldJ >( *cellDescription ); + auto fieldJ = std::make_unique< FieldJ >( *cellDescription ); dataConnector.consume( std::move( fieldJ ) ); for( uint32_t slot = 0; slot < fieldTmpNumSlots; ++slot) { - auto fieldTmp = makeUnique< FieldTmp >( *cellDescription, slot ); + auto fieldTmp = std::make_unique< FieldTmp >( *cellDescription, slot ); dataConnector.consume( std::move( fieldTmp ) ); } } diff --git a/include/pmacc/memory/buffers/ExchangeIntern.hpp b/include/pmacc/memory/buffers/ExchangeIntern.hpp index feb86ca3b0..9d4c5ce671 100644 --- a/include/pmacc/memory/buffers/ExchangeIntern.hpp +++ b/include/pmacc/memory/buffers/ExchangeIntern.hpp @@ -27,7 +27,6 @@ #include "pmacc/memory/dataTypes/Mask.hpp" #include "pmacc/memory/buffers/DeviceBufferIntern.hpp" #include "pmacc/memory/buffers/HostBufferIntern.hpp" -#include "pmacc/memory/MakeUnique.hpp" #include "pmacc/eventSystem/tasks/Factory.hpp" #include "pmacc/eventSystem/tasks/TaskReceive.hpp" @@ 
-72,7 +71,7 @@ namespace pmacc /*This is only a pointer to other device data */ using DeviceBuffer = DeviceBufferIntern; - deviceBuffer = memory::makeUnique( + deviceBuffer = std::make_unique( source, tmp_size, exchangeTypeToOffset( @@ -86,7 +85,7 @@ namespace pmacc if (DIM > DIM1) { /*create double buffer on gpu for faster memory transfers*/ - deviceDoubleBuffer = memory::makeUnique( + deviceDoubleBuffer = std::make_unique( tmp_size, false, true @@ -96,7 +95,7 @@ namespace pmacc if(!Environment<>::get().isMpiDirectEnabled()) { using HostBuffer = HostBufferIntern; - hostBuffer = memory::makeUnique(tmp_size); + hostBuffer = std::make_unique(tmp_size); } } @@ -105,7 +104,7 @@ namespace pmacc Exchange(exchange, communicationTag), deviceDoubleBuffer(nullptr), hostBuffer(nullptr) { using DeviceBuffer = DeviceBufferIntern; - deviceBuffer = memory::makeUnique( + deviceBuffer = std::make_unique( exchangeDataSpace, sizeOnDevice ); @@ -113,7 +112,7 @@ namespace pmacc if (DIM > DIM1) { /*create double buffer on gpu for faster memory transfers*/ - deviceDoubleBuffer = memory::makeUnique( + deviceDoubleBuffer = std::make_unique( exchangeDataSpace, false, true @@ -123,7 +122,7 @@ namespace pmacc if(!Environment<>::get().isMpiDirectEnabled()) { using HostBuffer = HostBufferIntern; - hostBuffer = memory::makeUnique(exchangeDataSpace); + hostBuffer = std::make_unique(exchangeDataSpace); } } diff --git a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp index 2fd631723e..d56ece4e8f 100644 --- a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp +++ b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -246,7 +245,7 @@ namespace kernel Space const & guardSize ) { - mapping = memory::makeUnique< T_MappingDesc >( + mapping = std::make_unique< T_MappingDesc >( layout, guardSize ); From 84b8c26ea34fbda0a2b11ba3425b0bc74a39f2e5 Mon Sep 17 00:00:00 2001 From: Sergei Bastrakov Date: Thu, 17 Sep 2020 09:43:31 +0200 Subject: [PATCH 10/13] Remove pmacc::memory::makeUnique that is no longer used --- include/pmacc/memory/MakeUnique.hpp | 48 ----------------------------- 1 file changed, 48 deletions(-) delete mode 100644 include/pmacc/memory/MakeUnique.hpp diff --git a/include/pmacc/memory/MakeUnique.hpp b/include/pmacc/memory/MakeUnique.hpp deleted file mode 100644 index 38d92ad6bf..0000000000 --- a/include/pmacc/memory/MakeUnique.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright 2019-2020 Sergei Bastrakov - * - * This file is part of PMacc. - * - * PMacc is free software: you can redistribute it and/or modify - * it under the terms of either the GNU General Public License or - * the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * PMacc is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License and the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * and the GNU Lesser General Public License along with PMacc. - * If not, see . - */ - -#pragma once - -#include -#include - - -namespace pmacc -{ -namespace memory -{ - - /* - * Analogue of std::make_unique for C++11, except not disabled for arrays. 
- * Implementation is taken from - * https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique - */ - template< - typename T, - typename ... T_Args - > - inline std::unique_ptr< T > makeUnique( T_Args && ... args ) - { - return std::unique_ptr< T >( new T( std::forward< T_Args >( args ) ... ) ); - } - -} // namespace memory -} // namespace pmacc From 8aefa63503dbfff92981f56da575c6a51bb4f459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Fri, 18 Sep 2020 09:14:46 +0200 Subject: [PATCH 11/13] HIP: RNG XorMin Add HIP support for random number generator XorMin. --- include/pmacc/Environment.hpp | 10 ++++- include/pmacc/random/methods/XorMin.hpp | 51 +++++++++++++++++++------ 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp index b35facbab4..7fe06a5554 100644 --- a/include/pmacc/Environment.hpp +++ b/include/pmacc/Environment.hpp @@ -490,9 +490,15 @@ namespace detail const int tryDeviceId = (deviceOffset + deviceNumber) % num_gpus; log("Trying to allocate device %1%.") % tryDeviceId; -#if (PMACC_CUDA_ENABLED == 1) + +#if(BOOST_LANG_CUDA || BOOST_LANG_HIP) +# if(BOOST_LANG_CUDA) cudaDeviceProp devProp; - CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&devProp, tryDeviceId)); +# elif(BOOST_LANG_HIP) + hipDeviceProp_t devProp; +# endif + + CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId)); /* If the cuda gpu compute mode is 'default' * (https://docs.nvidia.com/cuda/cuda-c-programming-guide/#compute-modes) diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp index 10850befcf..1344b24310 100644 --- a/include/pmacc/random/methods/XorMin.hpp +++ b/include/pmacc/random/methods/XorMin.hpp @@ -24,10 +24,12 @@ #include "pmacc/types.hpp" #include "pmacc/static_assert.hpp" -#if( PMACC_CUDA_ENABLED != 1 ) -# include "pmacc/random/methods/AlpakaRand.hpp" -#else +#if( BOOST_LANG_CUDA ) # include +#elif( BOOST_LANG_HIP ) +# include +#else +# include "pmacc/random/methods/AlpakaRand.hpp" #endif @@ -38,15 +40,17 @@ namespace random namespace methods { -#if( PMACC_CUDA_ENABLED != 1 ) - //! fallback to alpaka RNG if a cpu accelerator is used - template< typename T_Acc = cupla::Acc> - using XorMin = AlpakaRand< T_Acc >; -#else +#if( BOOST_LANG_CUDA || BOOST_LANG_HIP ) //! 
Uses the CUDA XORWOW RNG but does not store state members required for normal distribution template< typename T_Acc = cupla::Acc> class XorMin { +#if (BOOST_LANG_HIP) + using NativeStateType = hiprandStateXORWOW_t; +#elif (BOOST_LANG_CUDA) + using NativeStateType = curandStateXORWOW_t; +#endif + public: class StateType { @@ -63,14 +67,23 @@ namespace methods HDINLINE StateType( ) { } - DINLINE StateType( curandStateXORWOW_t const & other ): d( other.d ) + DINLINE StateType( NativeStateType const & other ): d( other.d ) { +#if (BOOST_LANG_HIP) + auto const* nativeStateArray = other.x; + PMACC_STATIC_ASSERT_MSG( + sizeof( v ) == sizeof( other.x ), + Unexpected_sizes + ); +#elif (BOOST_LANG_CUDA) + auto const* nativeStateArray = other.v; PMACC_STATIC_ASSERT_MSG( sizeof( v ) == sizeof( other.v ), Unexpected_sizes ); +#endif for( unsigned i = 0; i < sizeof( v ) / sizeof( v[ 0 ] ); i++ ) - v[ i ] = other.v[ i ]; + v[ i ] = nativeStateArray[ i ]; } }; @@ -82,13 +95,23 @@ namespace methods uint32_t subsequence = 0 ) const { - curandStateXORWOW_t tmpState; - curand_init( + NativeStateType tmpState; + +#if (BOOST_LANG_HIP) +# define define PMACC_RNG_INIT_FN hiprand_init +#elif (BOOST_LANG_CUDA) +# define define PMACC_RNG_INIT_FN curand_init +#endif + + PMACC_RNG_INIT_FN( seed, subsequence, 0, &tmpState ); + +#undef PMACC_RNG_INIT_FN + state = tmpState; } @@ -132,6 +155,10 @@ namespace methods return "XorMin"; } }; +#else + //! fallback to alpaka RNG if a cpu accelerator is used + template< typename T_Acc = cupla::Acc> + using XorMin = AlpakaRand< T_Acc >; #endif } // namespace methods } // namespace random From a27b90e48ffc81092271a680aa207879ead07936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Fri, 18 Sep 2020 09:16:35 +0200 Subject: [PATCH 12/13] HIP compatibility Increase HIP compatibility to PMacc and PIConGPU. 
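Note on the portability macros this patch leans on: ALPAKA_API_PREFIX, used in the Environment.hpp hunks below, prepends the proper runtime-API prefix (cuda.../hip...) to a function name at preprocessing time. The real macro is provided by cupla/alpaka; the following standalone stand-in (DEMO_API_PREFIX, PP_CONCAT and PP_STR are invented names for this sketch only) merely illustrates the token-pasting idea.

    #include <cstdio>

    #define PP_CONCAT_IMPL( a, b ) a##b
    #define PP_CONCAT( a, b ) PP_CONCAT_IMPL( a, b )
    #define PP_STR_IMPL( x ) #x
    #define PP_STR( x ) PP_STR_IMPL( x )

    /* pick the runtime-API prefix; an undefined predef macro evaluates to 0 in
     * an #if, so a plain host compile falls through to the HIP branch here */
    #if( BOOST_LANG_CUDA )
    #   define DEMO_API_PREFIX( name ) PP_CONCAT( cuda, name )
    #else
    #   define DEMO_API_PREFIX( name ) PP_CONCAT( hip, name )
    #endif

    int main()
    {
        // without CUDA language detection this prints "hipGetDeviceProperties"
        std::puts( PP_STR( DEMO_API_PREFIX( GetDeviceProperties ) ) );
    }

The same single call site therefore expands to cudaGetDeviceProperties or hipGetDeviceProperties depending on the backend, which is exactly how the device-property and device-flag calls are unified in the diff below.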
--- include/picongpu/_defaultParam.loader | 2 +- .../fields/currentDeposition/Strategy.def | 14 +++++++++++++ include/picongpu/particles/Particles.hpp | 2 +- .../picongpu/particles/ParticlesFunctors.hpp | 2 +- .../simulation/control/MySimulation.hpp | 12 +++++++---- include/pmacc/Environment.hpp | 15 +++++++------- include/pmacc/PMaccConfig.cmake | 2 +- .../algorithms/math/doubleMath/bessel.tpp | 16 +++++++-------- .../math/doubleMath/floatingPoint.tpp | 6 +++--- .../pmacc/algorithms/math/doubleMath/modf.tpp | 2 +- .../algorithms/math/floatMath/bessel.tpp | 16 +++++++-------- .../pmacc/algorithms/math/floatMath/exp.tpp | 2 +- .../math/floatMath/floatingPoint.tpp | 6 +++--- .../pmacc/algorithms/math/floatMath/modf.tpp | 2 +- include/pmacc/math/ConstVector.hpp | 6 +++--- .../particles/memory/boxes/ParticlesBox.hpp | 20 ++++++++++--------- include/pmacc/random/methods/XorMin.hpp | 18 ++++++++--------- include/pmacc/static_assert.hpp | 2 +- include/pmacc/types.hpp | 2 +- 19 files changed, 83 insertions(+), 64 deletions(-) diff --git a/include/picongpu/_defaultParam.loader b/include/picongpu/_defaultParam.loader index c9d3b14f75..04c53a8c58 100644 --- a/include/picongpu/_defaultParam.loader +++ b/include/picongpu/_defaultParam.loader @@ -26,7 +26,7 @@ #pragma once #include "picongpu/param/dimension.param" -#if( PMACC_CUDA_ENABLED == 1 ) +#if(BOOST_LANG_CUDA || BOOST_COMP_HIP) # include "picongpu/param/mallocMC.param" #endif #include "picongpu/param/memory.param" diff --git a/include/picongpu/fields/currentDeposition/Strategy.def b/include/picongpu/fields/currentDeposition/Strategy.def index 6950665ccd..f4963bea30 100644 --- a/include/picongpu/fields/currentDeposition/Strategy.def +++ b/include/picongpu/fields/currentDeposition/Strategy.def @@ -143,6 +143,20 @@ namespace traits alpaka::acc::AccGpuCudaRt< T_Args... > > { + // GPU Utilization is higher compared to `StridedCachedSupercells` + using type = strategy::CachedSupercells; + }; +#endif + +#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 ) + template< + typename ... T_Args + > + struct GetDefaultStrategy< + alpaka::acc::AccGpuHipRt< T_Args... 
> + > + { + // GPU Utilization is higher compared to `StridedCachedSupercells` using type = strategy::CachedSupercells; }; #endif diff --git a/include/picongpu/particles/Particles.hpp b/include/picongpu/particles/Particles.hpp index 08aaccb71d..eda48ded67 100644 --- a/include/picongpu/particles/Particles.hpp +++ b/include/picongpu/particles/Particles.hpp @@ -49,7 +49,7 @@ namespace picongpu { using namespace pmacc; -#if( PMACC_CUDA_ENABLED != 1 ) +#if(!BOOST_LANG_CUDA && !BOOST_COMP_HIP) /* dummy because we are not using mallocMC with cupla * DeviceHeap is defined in `mallocMC.param` */ diff --git a/include/picongpu/particles/ParticlesFunctors.hpp b/include/picongpu/particles/ParticlesFunctors.hpp index 103aa436f9..4630bfcf96 100644 --- a/include/picongpu/particles/ParticlesFunctors.hpp +++ b/include/picongpu/particles/ParticlesFunctors.hpp @@ -125,7 +125,7 @@ struct LogMemoryStatisticsForSpecies const std::shared_ptr& deviceHeap ) const { -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) log("mallocMC: free slots for species %3%: %1% a %2%") % deviceHeap->getAvailableSlots( cupla::manager::Device< cupla::AccDev >::get().current(), diff --git a/include/picongpu/simulation/control/MySimulation.hpp b/include/picongpu/simulation/control/MySimulation.hpp index cc12a59768..57395bab49 100644 --- a/include/picongpu/simulation/control/MySimulation.hpp +++ b/include/picongpu/simulation/control/MySimulation.hpp @@ -373,7 +373,9 @@ class MySimulation : public SimulationHelper this->bremsstrahlungPhotonAngle.init(); } +#endif +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) auto nativeCudaStream = cupla::manager::Stream< cupla::AccDev, cupla::AccStream @@ -425,7 +427,7 @@ class MySimulation : public SimulationHelper throw std::runtime_error(msg.str()); } -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) size_t heapSize = freeGpuMem - reservedGpuMemorySize; if( Environment<>::get().MemoryInfo().isSharedMemoryPool() ) @@ -443,10 +445,12 @@ class MySimulation : public SimulationHelper heapSize ); cuplaStreamSynchronize( 0 ); - - auto mallocMCBuffer = std::make_unique< MallocMCBuffer >( deviceHeap ); +# if( PMACC_CUDA_ENABLED == 1 ) + auto mallocMCBuffer = std::make_unique< MallocMCBuffer< DeviceHeap > >( deviceHeap ); dc.consume( std::move( mallocMCBuffer ) ); +# endif #endif + meta::ForEach< VectorAllSpecies, particles::LogMemoryStatisticsForSpecies > logMemoryStatisticsForSpecies; logMemoryStatisticsForSpecies( deviceHeap ); @@ -455,7 +459,7 @@ class MySimulation : public SimulationHelper IdProvider::init(); -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) /* add CUDA streams to the StreamController for concurrent execution */ Environment<>::get().StreamController().addStreams(6); #endif diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp index 7fe06a5554..7d222d65d2 100644 --- a/include/pmacc/Environment.hpp +++ b/include/pmacc/Environment.hpp @@ -468,7 +468,7 @@ namespace detail { int num_gpus = 0; //number of gpus cuplaGetDeviceCount(&num_gpus); -#if (PMACC_CUDA_ENABLED == 1) +#if(BOOST_LANG_CUDA|| BOOST_COMP_HIP) //##ERROR handling if (num_gpus < 1) //check if cupla device is found { @@ -506,7 +506,7 @@ namespace detail * The index used to select a device is based on the local MPI rank so * that each rank tries a different device. 
*/ - if (devProp.computeMode == cudaComputeModeDefault) + if (devProp.computeMode == ALPAKA_API_PREFIX(ComputeModeDefault)) { maxTries = 1; log("Device %1% is running in default mode.") % tryDeviceId; @@ -532,18 +532,17 @@ namespace detail if (rc == cuplaSuccess) { -#if (PMACC_CUDA_ENABLED == 1) - cudaDeviceProp dprop; - CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&dprop, tryDeviceId)); - log ("Set device to %1%: %2%") % tryDeviceId % dprop.name; - if(cudaErrorSetOnActiveProcess == cudaSetDeviceFlags(cudaDeviceScheduleSpin)) +#if(BOOST_LANG_CUDA || BOOST_LANG_HIP) + CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId)); + log ("Set device to %1%: %2%") % tryDeviceId % devProp.name; + if(ALPAKA_API_PREFIX(ErrorSetOnActiveProcess) == ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin))) { cuplaGetLastError(); //reset all errors /* - because of cuplaStreamCreate was called cuplaSetDeviceFlags crashed * - to set the flags reset the device and set flags again */ CUDA_CHECK(cuplaDeviceReset()); - CUDA_CHECK((cuplaError_t)cudaSetDeviceFlags(cudaDeviceScheduleSpin)); + CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin))); } #endif CUDA_CHECK(cuplaGetLastError()); diff --git a/include/pmacc/PMaccConfig.cmake b/include/pmacc/PMaccConfig.cmake index 2878eacee3..2c0a14e481 100644 --- a/include/pmacc/PMaccConfig.cmake +++ b/include/pmacc/PMaccConfig.cmake @@ -368,7 +368,7 @@ endif() # Find mallocMC ################################################################################ -if(ALPAKA_ACC_GPU_CUDA_ENABLE) +if(ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE) set(mallocMC_ALPAKA_PROVIDER "extern" CACHE STRING "Select which alpaka is used for mallocMC") find_package(mallocMC 2.5.0 QUIET) diff --git a/include/pmacc/algorithms/math/doubleMath/bessel.tpp b/include/pmacc/algorithms/math/doubleMath/bessel.tpp index c6f4af59be..b099bef2f8 100644 --- a/include/pmacc/algorithms/math/doubleMath/bessel.tpp +++ b/include/pmacc/algorithms/math/doubleMath/bessel.tpp @@ -39,7 +39,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::cyl_bessel_i0( x ); #else return boost::math::cyl_bessel_i( @@ -57,7 +57,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::cyl_bessel_i1( x ); #else return boost::math::cyl_bessel_i( @@ -75,7 +75,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::j0( x ); #else return boost::math::cyl_bessel_j( @@ -93,7 +93,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::j1( x ); #else return boost::math::cyl_bessel_j( @@ -117,7 +117,7 @@ namespace bessel result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::jn( n, x @@ -138,7 +138,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::y0( x ); #else return boost::math::cyl_neumann( @@ -156,7 +156,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::y1( x ); #else return boost::math::cyl_neumann( @@ -180,7 +180,7 @@ namespace 
bessel result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::yn( n, x diff --git a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp index 2f32967fd3..ec3d7c7a46 100644 --- a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp +++ b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp @@ -39,7 +39,7 @@ struct Float2int_ru HDINLINE result operator( )(double value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__double2int_ru( value ); #else return static_cast(ceil(value)); @@ -54,7 +54,7 @@ struct Float2int_rd HDINLINE result operator( )(double value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__double2int_rd( value ); #else return static_cast(floor(value)); @@ -69,7 +69,7 @@ struct Float2int_rn HDINLINE result operator( )(double value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__double2int_rn( value ); #else if(value < 0.0) diff --git a/include/pmacc/algorithms/math/doubleMath/modf.tpp b/include/pmacc/algorithms/math/doubleMath/modf.tpp index 92ec4741da..b1532568c2 100644 --- a/include/pmacc/algorithms/math/doubleMath/modf.tpp +++ b/include/pmacc/algorithms/math/doubleMath/modf.tpp @@ -36,7 +36,7 @@ struct Modf HDINLINE double operator()(double value, double* intpart) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::modf(value, intpart); #else return std::modf(value, intpart); diff --git a/include/pmacc/algorithms/math/floatMath/bessel.tpp b/include/pmacc/algorithms/math/floatMath/bessel.tpp index e627ee012e..15554587d6 100644 --- a/include/pmacc/algorithms/math/floatMath/bessel.tpp +++ b/include/pmacc/algorithms/math/floatMath/bessel.tpp @@ -39,7 +39,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::cyl_bessel_i0f( x ); #else return boost::math::cyl_bessel_i( @@ -57,7 +57,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::cyl_bessel_i1f( x ); #else return boost::math::cyl_bessel_i( @@ -75,7 +75,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu_ return ::j0f( x ); #else return boost::math::cyl_bessel_j( @@ -93,7 +93,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::j1f( x ); #else return boost::math::cyl_bessel_j( @@ -117,7 +117,7 @@ namespace bessel result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::jnf( n, x @@ -138,7 +138,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::y0f( x ); #else return boost::math::cyl_neumann( @@ -156,7 +156,7 @@ namespace bessel HDINLINE result operator( )( result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::y1f( x ); #else return boost::math::cyl_neumann( @@ -180,7 +180,7 @@ namespace bessel result const & x ) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::ynf( n, x diff --git a/include/pmacc/algorithms/math/floatMath/exp.tpp b/include/pmacc/algorithms/math/floatMath/exp.tpp index 772dcf87a9..97ae7e0d13 
100644 --- a/include/pmacc/algorithms/math/floatMath/exp.tpp +++ b/include/pmacc/algorithms/math/floatMath/exp.tpp @@ -38,7 +38,7 @@ namespace math HDINLINE float operator( )(const float& value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::log10f( value ); #else return ::log10( value ); diff --git a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp index 206b0118f1..681f33e21a 100644 --- a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp +++ b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp @@ -39,7 +39,7 @@ struct Float2int_ru HDINLINE result operator( )(float value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__float2int_ru( value ); #else return static_cast(ceil(value)); @@ -54,7 +54,7 @@ struct Float2int_rd HDINLINE result operator( )(float value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__float2int_rd( value ); #else return static_cast(floor(value)); @@ -69,7 +69,7 @@ struct Float2int_rn HDINLINE result operator( )(float value) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::__float2int_rn( value ); #else if(value < 0.0f) diff --git a/include/pmacc/algorithms/math/floatMath/modf.tpp b/include/pmacc/algorithms/math/floatMath/modf.tpp index d2678d179e..59efffd3ae 100644 --- a/include/pmacc/algorithms/math/floatMath/modf.tpp +++ b/include/pmacc/algorithms/math/floatMath/modf.tpp @@ -36,7 +36,7 @@ struct Modf HDINLINE float operator()(float value, float* intpart) { -#if __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu return ::modff(value, intpart); #else return std::modf(value, intpart); diff --git a/include/pmacc/math/ConstVector.hpp b/include/pmacc/math/ConstVector.hpp index b1fbad1b0a..b2b4df31de 100644 --- a/include/pmacc/math/ConstVector.hpp +++ b/include/pmacc/math/ConstVector.hpp @@ -26,13 +26,13 @@ #include "pmacc/types.hpp" /* select namespace depending on __CUDA_ARCH__ compiler flag*/ -#ifdef __CUDA_ARCH__ //we are on gpu +#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu # define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_device,id) #else # define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_host,id) #endif -#ifdef __CUDACC__ +#if defined(__CUDACC__) || BOOST_COMP_HIP # define PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id,Name,Type,...) 
\ namespace PMACC_JOIN(pmacc_static_const_vector_device,id) \ { \ @@ -87,7 +87,7 @@ namespace PMACC_JOIN(pmacc_static_const_storage,id) \ } /* namespace pmacc_static_const_storage + id */ \ using namespace PMACC_JOIN(pmacc_static_const_storage,id) -#ifdef __CUDACC__ +#if defined(__CUDACC__) || BOOST_COMP_HIP # define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name,id) \ namespace PMACC_JOIN(pmacc_static_const_vector_device,id) \ { \ diff --git a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp index 031aae887b..aabb323990 100644 --- a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp +++ b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp @@ -22,7 +22,7 @@ #pragma once -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) # include #endif #include "pmacc/particles/frame_types.hpp" @@ -97,7 +97,7 @@ class ParticlesBox : protected DataBox, DIM> > const int maxTries = 13; //magic number is not performance critical for ( int numTries = 0; numTries < maxTries; ++numTries ) { -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) tmp = (FrameType*) m_deviceHeapHandle.malloc( acc, sizeof (FrameType) ); #else tmp = new FrameType; @@ -107,7 +107,7 @@ class ParticlesBox : protected DataBox, DIM> > /* disable all particles since we can not assume that newly allocated memory contains zeros */ for ( int i = 0; i < (int) math::CT::volume::type::value; ++i ) ( *tmp )[i][multiMask_] = 0; -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) /* takes care that changed values are visible to all threads inside this block*/ __threadfence_block( ); #endif @@ -115,10 +115,12 @@ class ParticlesBox : protected DataBox, DIM> > } else { +#ifndef BOOST_COMP_HIP printf( "%s: mallocMC out of memory (try %i of %i)\n", (numTries + 1) == maxTries ? "ERROR" : "WARNING", numTries + 1, maxTries ); +#endif } } @@ -133,7 +135,7 @@ class ParticlesBox : protected DataBox, DIM> > template DINLINE void removeFrame( const T_Acc & acc, FramePtr& frame ) { -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) m_deviceHeapHandle.free( acc, (void*) frame.ptr ); #else delete(frame.ptr); @@ -144,14 +146,14 @@ class ParticlesBox : protected DataBox, DIM> > HDINLINE FramePtr mapPtr( const FramePtr& devPtr ) const { -#ifndef __CUDA_ARCH__ +#if( CUPLA_DEVICE_COMPILE == 1) + return devPtr; +#else int64_t useOffset = hostMemoryOffset * static_cast (devPtr.ptr != 0); return FramePtr( reinterpret_cast ( reinterpret_cast (devPtr.ptr) - useOffset ) ); -#else - return devPtr; #endif } @@ -218,7 +220,7 @@ class ParticlesBox : protected DataBox, DIM> > frame->previousFrame = FramePtr( ); frame->nextFrame = FramePtr( *firstFrameNativPtr ); -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) /* - takes care that `next[index]` is visible to all threads on the gpu * - this is needed because later on in this method we change `previous` * of an other frame, this must be done in order! @@ -267,7 +269,7 @@ class ParticlesBox : protected DataBox, DIM> > frame->nextFrame = FramePtr( ); frame->previousFrame = FramePtr( *lastFrameNativPtr ); -#if( PMACC_CUDA_ENABLED == 1 ) +#if( BOOST_LANG_CUDA || BOOST_COMP_HIP) /* - takes care that `next[index]` is visible to all threads on the gpu * - this is needed because later on in this method we change `next` * of an other frame, this must be done in order! 
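Side note on the hunks above: CUPLA_DEVICE_COMPILE (provided by cupla and assumed here to be 1 only during a device compilation pass) replaces the CUDA-specific __CUDA_ARCH__ check, so the same host/device dispatch also works under HIP. A minimal standalone sketch of that pattern, mirroring the patched Modf functor (the function name splitFraction is invented for this example):

    #include <cmath>
    #include <cstdio>

    inline double splitFraction( double value, double* intpart )
    {
    #if( CUPLA_DEVICE_COMPILE == 1 ) // device pass (CUDA or HIP): global-namespace math
        return ::modf( value, intpart );
    #else                            // host pass: the C++ standard library
        return std::modf( value, intpart );
    #endif
    }

    int main()
    {
        double intpart = 0.0;
        double const frac = splitFraction( 3.25, &intpart );
        std::printf( "int = %g, frac = %g\n", intpart, frac ); // int = 3, frac = 0.25
    }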
diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp
index 1344b24310..5822897d67 100644
--- a/include/pmacc/random/methods/XorMin.hpp
+++ b/include/pmacc/random/methods/XorMin.hpp
@@ -40,14 +40,14 @@ namespace random
 namespace methods
 {
 
-#if( BOOST_LANG_CUDA || BOOST_LANG_HIP )
+#if( ALPAKA_ACC_GPU_CUDA_ENABLED || ALPAKA_ACC_GPU_HIP_ENABLED )
     //! Uses the CUDA XORWOW RNG but does not store state members required for normal distribution
     template< typename T_Acc = cupla::Acc>
     class XorMin
     {
-#if (BOOST_LANG_HIP)
+#if( BOOST_LANG_HIP )
         using NativeStateType = hiprandStateXORWOW_t;
-#elif (BOOST_LANG_CUDA)
+#elif( BOOST_LANG_CUDA )
         using NativeStateType = curandStateXORWOW_t;
 #endif
 
@@ -69,13 +69,13 @@ namespace methods
 
             DINLINE StateType( NativeStateType const & other ): d( other.d )
             {
-#if (BOOST_LANG_HIP)
+#if( BOOST_LANG_HIP )
                 auto const* nativeStateArray = other.x;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.x ),
                     Unexpected_sizes
                 );
-#elif (BOOST_LANG_CUDA)
+#elif( BOOST_LANG_CUDA )
                 auto const* nativeStateArray = other.v;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.v ),
@@ -97,10 +97,10 @@ namespace methods
         {
             NativeStateType tmpState;
 
-#if (BOOST_LANG_HIP)
-# define define PMACC_RNG_INIT_FN hiprand_init
-#elif (BOOST_LANG_CUDA)
-# define define PMACC_RNG_INIT_FN curand_init
+#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
+# define PMACC_RNG_INIT_FN hiprand_init
+#elif( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 )
+# define PMACC_RNG_INIT_FN curand_init
 #endif
 
             PMACC_RNG_INIT_FN(
diff --git a/include/pmacc/static_assert.hpp b/include/pmacc/static_assert.hpp
index 7f279572e0..1f730bbf36 100644
--- a/include/pmacc/static_assert.hpp
+++ b/include/pmacc/static_assert.hpp
@@ -45,7 +45,7 @@ namespace pmacc
  * @param pmacc_unique_id pre compiler unique id
  * @param pmacc_typeInfo a type that is shown in error message
  */
-#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA
+#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
 /* device compile with clang: boost static assert can not be used
  * error is: calling a `__host__` function from `__device__`
  * Therefore C++11 `static_assert` is used
diff --git a/include/pmacc/types.hpp b/include/pmacc/types.hpp
index 826219474d..7230641f36 100644
--- a/include/pmacc/types.hpp
+++ b/include/pmacc/types.hpp
@@ -33,7 +33,7 @@
 # define PMACC_CUDA_ENABLED ALPAKA_ACC_GPU_CUDA_ENABLED
 #endif
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
 /* include mallocMC before cupla renaming is activated, else we need the variable acc
  * to call atomic cuda functions
 */
From 315bd9a4286beb370ba287dac47dd25b9b0ec5a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?=
Date: Thu, 17 Sep 2020 11:28:22 +0200
Subject: [PATCH 13/13] radiation plugin: add new option

Add the runtime option `numJobs` to increase the amount of independent work
per device. If set to one, the code behaves as before this PR. The default
is two, to better utilize modern GPU devices in a typical configuration.
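To make the intent of `numJobs` concrete before the diff: each job owns one column of a (number of amplitudes) x numJobs result buffer and strides over the work items, so jobs never write to the same element; afterwards all columns are summed into column zero, mirroring the host-side reduction added to Radiation.hpp below. A standalone host-only sketch under these assumptions (fake unit contributions stand in for the real radiation amplitudes):

    #include <cstdio>
    #include <vector>

    int main()
    {
        int const numJobs = 2;        // the new runtime option
        int const numAmplitudes = 4;  // N_theta * N_omega in the plugin
        int const numWorkItems = 10;  // stands in for the number of supercells

        // layout: result[ amplitude + numAmplitudes * job ]
        std::vector< double > result( numAmplitudes * numJobs, 0.0 );

        for( int job = 0; job < numJobs; ++job )                          // independent jobs
            for( int item = job; item < numWorkItems; item += numJobs )   // job-strided loop
                for( int a = 0; a < numAmplitudes; ++a )
                    result[ a + numAmplitudes * job ] += 1.0;             // fake contribution

        // reduce all temporary columns into column zero
        for( int job = 1; job < numJobs; ++job )
            for( int a = 0; a < numAmplitudes; ++a )
                result[ a ] += result[ a + numAmplitudes * job ];

        std::printf( "first amplitude after reduction: %g\n", result[ 0 ] ); // prints 10
    }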
--- docs/TBG_macros.cfg | 3 +- docs/source/usage/plugins/radiation.rst | 8 ++++- .../picongpu/plugins/radiation/Radiation.hpp | 33 +++++++++++++++---- .../plugins/radiation/Radiation.kernel | 8 +++-- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/docs/TBG_macros.cfg b/docs/TBG_macros.cfg index e11947e906..c09d4ff815 100644 --- a/docs/TBG_macros.cfg +++ b/docs/TBG_macros.cfg @@ -124,7 +124,8 @@ TBG_stopWindow="--stopWindow 1337" #--_radiation.end Time step to stop calculating the radiation #--_radiation.radPerGPU If flag is set, each GPU stores its own spectra without summing the entire simulation area #--_radiation.folderRadPerGPU Folder where the GPU specific spectras are stored -#--e__radiation.compression If flag is set, the hdf5 output will be compressed. +#--_radiation.compression If flag is set, the hdf5 output will be compressed. +#--_radiation.numJobs Number of independent jobs used for the radiation calculation. TBG_radiation="--_radiation.period 1 --_radiation.dump 2 --_radiation.totalRadiation \ --_radiation.lastRadiation --_radiation.start 2800 --_radiation.end 3000" diff --git a/docs/source/usage/plugins/radiation.rst b/docs/source/usage/plugins/radiation.rst index 002035a3b4..a47cc8580e 100644 --- a/docs/source/usage/plugins/radiation.rst +++ b/docs/source/usage/plugins/radiation.rst @@ -287,6 +287,11 @@ Command line option Description ``--_radiation.folderRadPerGPU`` Name of the folder, where the GPU specific spectra are stored. Default: ``radPerGPU`` ``--_radiation.compression`` If set, the hdf5 output is compressed. +``--_radiation.numJobs`` Number of independent jobs used for the radiation calculation. + This option is used to increase the utilization of the device by producing more independent work. + This option enables accumulation of data in parallel into multiple temporary arrays, thereby increasing the utilization of + the device by increasing the memory footprint + Default: ``2`` ========================================= ============================================================================================================================== Memory Complexity @@ -295,7 +300,8 @@ Memory Complexity Accelerator """"""""""" -each energy bin times each coordinate bin allocates one counter (``float_X``) permanently and on each accelerator. +locally, ``numJobs`` times number of frequencies ``N_omega`` times number of directions ``N_theta`` is permanently allocated. +Each result element (amplitude) is a double precision complex number. Host """" diff --git a/include/picongpu/plugins/radiation/Radiation.hpp b/include/picongpu/plugins/radiation/Radiation.hpp index d07700bc79..f05912cd4e 100644 --- a/include/picongpu/plugins/radiation/Radiation.hpp +++ b/include/picongpu/plugins/radiation/Radiation.hpp @@ -77,7 +77,6 @@ namespace idLabels }// end namespace idLabels - /////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////// Radiation Plugin Class //////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////// @@ -97,8 +96,10 @@ class Radiation : public ISimulationPlugin * frequency. Layout of the radiation array is: * [omega_1(theta_1),omega_2(theta_1),...,omega_N-omega(theta_1), * omega_1(theta_2),omega_2(theta_2),...,omega_N-omega(theta_N-theta)] + * The second dimension is used to store intermediate results if command + * line option numJobs is > 1. 
      */
-    GridBuffer *radiation;
+    GridBuffer *radiation;
 
     radiation_frequencies::InitFreqFunctor freqInit;
     radiation_frequencies::FreqFunctor freqFkt;
@@ -119,6 +120,7 @@ class Radiation : public ISimulationPlugin
     bool radPerGPU;
     std::string folderRadPerGPU;
     DataSpace lastGPUpos;
+    int numJobs;
 
     /**
      * Data structure for storage and summation of the intermediate values of
@@ -214,7 +216,8 @@ class Radiation : public ISimulationPlugin
             ((pluginPrefix + ".end").c_str(), po::value (&radEnd)->default_value(0), "time index when radiation should end with calculation")
             ((pluginPrefix + ".radPerGPU").c_str(), po::bool_switch(&radPerGPU), "enable radiation output from each GPU individually")
             ((pluginPrefix + ".folderRadPerGPU").c_str(), po::value (&folderRadPerGPU)->default_value("radPerGPU"), "folder in which the radiation of each GPU is written")
-            ((pluginPrefix + ".compression").c_str(), po::bool_switch(&compressionOn), "enable compression of hdf5 output");
+            ((pluginPrefix + ".compression").c_str(), po::bool_switch(&compressionOn), "enable compression of hdf5 output")
+            ((pluginPrefix + ".numJobs").c_str(), po::value (&numJobs)->default_value(2), "Number of independent jobs used for the radiation calculation.");
 
@@ -282,13 +285,22 @@ class Radiation : public ISimulationPlugin
     {
         if(!notifyPeriod.empty())
         {
+            if(numJobs <= 0)
+            {
+                std::cerr << "'numJobs' must be >= 1, value is adjusted from " << numJobs << " to 1." << std::endl;
+                numJobs = 1;
+            }
             // allocate memory for all amplitudes for temporal data collection
             tmp_result = new Amplitude[elements_amplitude()];
 
             /*only rank 0 create a file*/
             isMaster = reduce.hasResult(mpi::reduceMethods::Reduce());
-            radiation = new GridBuffer (DataSpace (elements_amplitude())); //create one int on GPU and host
+            /* Buffer for GPU results.
+             * The second dimension is used to store intermediate results if command
+             * line option numJobs is > 1.
+ */ + radiation = new GridBuffer (DataSpace<2>(elements_amplitude(), numJobs)); freqInit.Init(frequencies_from_list::listLocation); freqFkt = freqInit.getFunctor(); @@ -387,6 +399,15 @@ class Radiation : public ISimulationPlugin { radiation->deviceToHost(); __getTransactionEvent().waitForFinished(); + + auto dbox = radiation->getHostBuffer().getDataBox(); + int numAmp = elements_amplitude(); + // update the main result matrix (y index zero) + for( int resultIdx = 1; resultIdx < numJobs; ++resultIdx ) + for( int ampIdx = 0; ampIdx < numAmp; ++ampIdx ) + { + dbox(DataSpace< 2 >( ampIdx, 0 ) ) += dbox(DataSpace< 2 >( ampIdx, resultIdx ) ); + } } @@ -1188,8 +1209,8 @@ class Radiation : public ISimulationPlugin PMACC_KERNEL( KernelRadiationParticles< numWorkers >{} )( - gridDim_rad, - numWorkers + DataSpace< 2 >(gridDim_rad, numJobs), + DataSpace< 2 >(numWorkers,1) )( /*Pointer to particles memory on the device*/ particles->getDeviceParticlesBox(), diff --git a/include/picongpu/plugins/radiation/Radiation.kernel b/include/picongpu/plugins/radiation/Radiation.kernel index c81b030f21..13421f29cd 100644 --- a/include/picongpu/plugins/radiation/Radiation.kernel +++ b/include/picongpu/plugins/radiation/Radiation.kernel @@ -183,11 +183,13 @@ namespace radiation // get absolute number of relevant super cells int const numSuperCells = superCellsCount.productOfComponents(); + int const numJobs = cupla::gridDim(acc).y; + int const jobIdx = cupla::blockIdx(acc).y; - /* go over all super cells on GPU + /* go over all super cells on GPU with a stride depending on number of temporary results * but ignore all guarding supercells */ - for( int super_cell_index = 0; super_cell_index <= numSuperCells; ++super_cell_index ) + for( int super_cell_index = jobIdx; super_cell_index <= numSuperCells; super_cell_index += numJobs ) { // select SuperCell and add one sided guard again DataSpace< simDim > const superCell = @@ -481,7 +483,7 @@ namespace radiation * - from this (one) time step * - omega_id = theta_idx * radiation_frequencies::N_omega + o */ - radiation[ theta_idx * radiation_frequencies::N_omega + o] += amplitude; + radiation( DataSpace< 2 >(theta_idx * radiation_frequencies::N_omega + o, jobIdx ) ) += amplitude; } // end frequency loop
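For completeness, a hypothetical CUDA-only sketch (not the plugin kernel; the names and the work decomposition are simplified) of the launch-geometry change shown above: the y grid dimension carries the job index and each job writes its own column of the result buffer, which is why the job-strided loop needs no synchronization between jobs.

    #include <cstdio>

    __global__ void accumulatePartial( float* result, int numAmplitudes, int numItems )
    {
        int const job = blockIdx.y;       // plays the role of jobIdx in Radiation.kernel
        int const numJobs = gridDim.y;
        int const amplitude = blockIdx.x * blockDim.x + threadIdx.x;
        if( amplitude >= numAmplitudes )
            return;

        float sum = 0.0f;
        for( int item = job; item < numItems; item += numJobs ) // job-strided work loop
            sum += 1.0f;                                         // stand-in contribution

        result[ amplitude + numAmplitudes * job ] = sum;         // one column per job
    }

    int main()
    {
        int const numAmplitudes = 256;
        int const numJobs = 2;
        int const numItems = 1000;
        float* deviceResult = nullptr;
        cudaMalloc( (void**)&deviceResult, sizeof( float ) * numAmplitudes * numJobs );
        dim3 const blocks( ( numAmplitudes + 127 ) / 128, numJobs ); // y dimension = jobs
        accumulatePartial<<< blocks, 128 >>>( deviceResult, numAmplitudes, numItems );
        cudaDeviceSynchronize();
        cudaFree( deviceResult );
        std::puts( "done" );
    }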