From 8aefa63503dbfff92981f56da575c6a51bb4f459 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?=
Date: Fri, 18 Sep 2020 09:14:46 +0200
Subject: [PATCH 1/2] HIP: RNG XorMin

Add HIP support for the random number generator XorMin.
---
 include/pmacc/Environment.hpp           | 10 ++++-
 include/pmacc/random/methods/XorMin.hpp | 51 +++++++++++++++++++------
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp
index b35facbab4..7fe06a5554 100644
--- a/include/pmacc/Environment.hpp
+++ b/include/pmacc/Environment.hpp
@@ -490,9 +490,15 @@ namespace detail
             const int tryDeviceId = (deviceOffset + deviceNumber) % num_gpus;
             log("Trying to allocate device %1%.") % tryDeviceId;
-#if (PMACC_CUDA_ENABLED == 1)
+
+#if(BOOST_LANG_CUDA || BOOST_LANG_HIP)
+# if(BOOST_LANG_CUDA)
             cudaDeviceProp devProp;
-            CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&devProp, tryDeviceId));
+# elif(BOOST_LANG_HIP)
+            hipDeviceProp_t devProp;
+# endif
+
+            CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId));
 
             /* If the cuda gpu compute mode is 'default'
              * (https://docs.nvidia.com/cuda/cuda-c-programming-guide/#compute-modes)
diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp
index 10850befcf..1344b24310 100644
--- a/include/pmacc/random/methods/XorMin.hpp
+++ b/include/pmacc/random/methods/XorMin.hpp
@@ -24,10 +24,12 @@
 #include "pmacc/types.hpp"
 #include "pmacc/static_assert.hpp"
 
-#if( PMACC_CUDA_ENABLED != 1 )
-#   include "pmacc/random/methods/AlpakaRand.hpp"
-#else
+#if( BOOST_LANG_CUDA )
 #   include <curand_kernel.h>
+#elif( BOOST_LANG_HIP )
+#   include <hiprand_kernel.h>
+#else
+#   include "pmacc/random/methods/AlpakaRand.hpp"
 #endif
 
 
@@ -38,15 +40,17 @@ namespace random
 namespace methods
 {
 
-#if( PMACC_CUDA_ENABLED != 1 )
-    //! fallback to alpaka RNG if a cpu accelerator is used
-    template< typename T_Acc = cupla::Acc>
-    using XorMin = AlpakaRand< T_Acc >;
-#else
+#if( BOOST_LANG_CUDA || BOOST_LANG_HIP )
     //! Uses the CUDA XORWOW RNG but does not store state members required for normal distribution
     template< typename T_Acc = cupla::Acc>
     class XorMin
     {
+#if (BOOST_LANG_HIP)
+        using NativeStateType = hiprandStateXORWOW_t;
+#elif (BOOST_LANG_CUDA)
+        using NativeStateType = curandStateXORWOW_t;
+#endif
+
     public:
         class StateType
         {
@@ -63,14 +67,23 @@ namespace methods
             HDINLINE StateType( )
             { }
 
-            DINLINE StateType( curandStateXORWOW_t const & other ): d( other.d )
+            DINLINE StateType( NativeStateType const & other ): d( other.d )
             {
+#if (BOOST_LANG_HIP)
+                auto const* nativeStateArray = other.x;
+                PMACC_STATIC_ASSERT_MSG(
+                    sizeof( v ) == sizeof( other.x ),
+                    Unexpected_sizes
+                );
+#elif (BOOST_LANG_CUDA)
+                auto const* nativeStateArray = other.v;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.v ),
                     Unexpected_sizes
                 );
+#endif
                 for( unsigned i = 0; i < sizeof( v ) / sizeof( v[ 0 ] ); i++ )
-                    v[ i ] = other.v[ i ];
+                    v[ i ] = nativeStateArray[ i ];
             }
         };
 
@@ -82,13 +95,23 @@ namespace methods
             uint32_t subsequence = 0
         ) const
         {
-            curandStateXORWOW_t tmpState;
-            curand_init(
+            NativeStateType tmpState;
+
+#if (BOOST_LANG_HIP)
+# define define PMACC_RNG_INIT_FN hiprand_init
+#elif (BOOST_LANG_CUDA)
+# define define PMACC_RNG_INIT_FN curand_init
+#endif
+
+            PMACC_RNG_INIT_FN(
                 seed,
                 subsequence,
                 0,
                 &tmpState
             );
+
+#undef PMACC_RNG_INIT_FN
+
             state = tmpState;
         }
 
@@ -132,6 +155,10 @@ namespace methods
             return "XorMin";
         }
     };
+#else
+    //! fallback to alpaka RNG if a cpu accelerator is used
+    template< typename T_Acc = cupla::Acc>
+    using XorMin = AlpakaRand< T_Acc >;
 #endif
 } // namespace methods
 } // namespace random
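The StateType copy constructor above keeps only the XORWOW core (the `d` counter and the five 32-bit state words) and drops the Box-Muller members that the full curand/hiprand states carry; the static assert guards against the native array changing size. A minimal standalone sketch of that reduction, with a hypothetical `copyXorwowCore` helper (the member name `v` matches curand; hiprand names the same array `x`):

    #include <cstdint>

    // Reduced XORWOW state: the counter plus five state words, without
    // the Box-Muller members of the full curand/hiprand state.
    struct XorwowCore
    {
        std::uint32_t d;
        std::uint32_t v[5];
    };

    // Hypothetical helper: extract the core from a native state type whose
    // state array is named `v` (for hiprand the member would be `x`).
    template< typename T_NativeState >
    XorwowCore copyXorwowCore( T_NativeState const & native )
    {
        static_assert(
            sizeof( XorwowCore::v ) == sizeof( native.v ),
            "unexpected native state size" );
        XorwowCore core;
        core.d = native.d;
        for( unsigned i = 0u; i < 5u; ++i )
            core.v[ i ] = native.v[ i ];
        return core;
    }
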
From a27b90e48ffc81092271a680aa207879ead07936 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?=
Date: Fri, 18 Sep 2020 09:16:35 +0200
Subject: [PATCH 2/2] HIP compatibility

Increase HIP compatibility of PMacc and PIConGPU.
---
 include/picongpu/_defaultParam.loader         |  2 +-
 .../fields/currentDeposition/Strategy.def     | 14 +++++++++++++
 include/picongpu/particles/Particles.hpp      |  2 +-
 .../picongpu/particles/ParticlesFunctors.hpp  |  2 +-
 .../simulation/control/MySimulation.hpp       | 12 +++++++----
 include/pmacc/Environment.hpp                 | 15 +++++++-------
 include/pmacc/PMaccConfig.cmake               |  2 +-
 .../algorithms/math/doubleMath/bessel.tpp     | 16 +++++++--------
 .../math/doubleMath/floatingPoint.tpp         |  6 +++---
 .../pmacc/algorithms/math/doubleMath/modf.tpp |  2 +-
 .../algorithms/math/floatMath/bessel.tpp      | 16 +++++++--------
 .../pmacc/algorithms/math/floatMath/exp.tpp   |  2 +-
 .../math/floatMath/floatingPoint.tpp          |  6 +++---
 .../pmacc/algorithms/math/floatMath/modf.tpp  |  2 +-
 include/pmacc/math/ConstVector.hpp            |  6 +++---
 .../particles/memory/boxes/ParticlesBox.hpp   | 20 ++++++++++---------
 include/pmacc/random/methods/XorMin.hpp       | 18 ++++++++--------
 include/pmacc/static_assert.hpp               |  2 +-
 include/pmacc/types.hpp                       |  2 +-
 19 files changed, 83 insertions(+), 64 deletions(-)

diff --git a/include/picongpu/_defaultParam.loader b/include/picongpu/_defaultParam.loader
index c9d3b14f75..04c53a8c58 100644
--- a/include/picongpu/_defaultParam.loader
+++ b/include/picongpu/_defaultParam.loader
@@ -26,7 +26,7 @@
 #pragma once
 
 #include "picongpu/param/dimension.param"
-#if( PMACC_CUDA_ENABLED == 1 )
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
 #   include "picongpu/param/mallocMC.param"
 #endif
 #include "picongpu/param/memory.param"
diff --git a/include/picongpu/fields/currentDeposition/Strategy.def b/include/picongpu/fields/currentDeposition/Strategy.def
index 6950665ccd..f4963bea30 100644
--- a/include/picongpu/fields/currentDeposition/Strategy.def
+++ b/include/picongpu/fields/currentDeposition/Strategy.def
@@ -143,6 +143,20 @@ namespace traits
         alpaka::acc::AccGpuCudaRt< T_Args... >
     >
     {
+        // GPU utilization is higher compared to `StridedCachedSupercells`
         using type = strategy::CachedSupercells;
     };
 #endif
+
+#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
+    template<
+        typename ... T_Args
+    >
+    struct GetDefaultStrategy<
+        alpaka::acc::AccGpuHipRt< T_Args... >
+    >
+    {
+        // GPU utilization is higher compared to `StridedCachedSupercells`
+        using type = strategy::CachedSupercells;
+    };
+#endif
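GetDefaultStrategy above is an ordinary trait: a primary template supplies the fallback current-deposition strategy and one partial specialization per accelerator overrides it. A condensed sketch of the pattern with stand-in tag and strategy types (not the PIConGPU definitions):

    struct StridedCachedSupercells {}; // generic fallback strategy
    struct CachedSupercells {};        // strategy preferred on GPUs

    // primary template: used whenever no specialization matches
    template< typename T_Acc >
    struct GetDefaultStrategy
    {
        using type = StridedCachedSupercells;
    };

    // stand-in for an accelerator tag such as alpaka::acc::AccGpuHipRt
    template< typename... T_Args >
    struct AccGpuRt {};

    // specialization mirroring the one added in the diff above
    template< typename... T_Args >
    struct GetDefaultStrategy< AccGpuRt< T_Args... > >
    {
        // GPU utilization is higher compared to `StridedCachedSupercells`
        using type = CachedSupercells;
    };
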
diff --git a/include/picongpu/particles/Particles.hpp b/include/picongpu/particles/Particles.hpp
index 08aaccb71d..eda48ded67 100644
--- a/include/picongpu/particles/Particles.hpp
+++ b/include/picongpu/particles/Particles.hpp
@@ -49,7 +49,7 @@ namespace picongpu
 {
 using namespace pmacc;
 
-#if( PMACC_CUDA_ENABLED != 1 )
+#if(!BOOST_LANG_CUDA && !BOOST_COMP_HIP)
 /* dummy because we are not using mallocMC with cupla
  * DeviceHeap is defined in `mallocMC.param`
  */
diff --git a/include/picongpu/particles/ParticlesFunctors.hpp b/include/picongpu/particles/ParticlesFunctors.hpp
index 103aa436f9..4630bfcf96 100644
--- a/include/picongpu/particles/ParticlesFunctors.hpp
+++ b/include/picongpu/particles/ParticlesFunctors.hpp
@@ -125,7 +125,7 @@ struct LogMemoryStatisticsForSpecies
         const std::shared_ptr< DeviceHeap >& deviceHeap
     ) const
     {
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
        log("mallocMC: free slots for species %3%: %1% a %2%") %
            deviceHeap->getAvailableSlots(
                cupla::manager::Device< cupla::AccDev >::get().current(),
diff --git a/include/picongpu/simulation/control/MySimulation.hpp b/include/picongpu/simulation/control/MySimulation.hpp
index cc12a59768..57395bab49 100644
--- a/include/picongpu/simulation/control/MySimulation.hpp
+++ b/include/picongpu/simulation/control/MySimulation.hpp
@@ -373,7 +373,9 @@ class MySimulation : public SimulationHelper
 
             this->bremsstrahlungPhotonAngle.init();
         }
+#endif
 
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
         auto nativeCudaStream = cupla::manager::Stream<
             cupla::AccDev,
             cupla::AccStream
@@ -425,7 +427,7 @@ class MySimulation : public SimulationHelper
             throw std::runtime_error(msg.str());
         }
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
         size_t heapSize = freeGpuMem - reservedGpuMemorySize;
 
         if( Environment<>::get().MemoryInfo().isSharedMemoryPool() )
@@ -443,10 +445,12 @@ class MySimulation : public SimulationHelper
             heapSize
         );
         cuplaStreamSynchronize( 0 );
-
-        auto mallocMCBuffer = std::make_unique< MallocMCBuffer >( deviceHeap );
+# if( PMACC_CUDA_ENABLED == 1 )
+        auto mallocMCBuffer = std::make_unique< MallocMCBuffer< DeviceHeap > >( deviceHeap );
         dc.consume( std::move( mallocMCBuffer ) );
+# endif
 #endif
+
         meta::ForEach< VectorAllSpecies, particles::LogMemoryStatisticsForSpecies > logMemoryStatisticsForSpecies;
         logMemoryStatisticsForSpecies( deviceHeap );
 
@@ -455,7 +459,7 @@ class MySimulation : public SimulationHelper
 
         IdProvider< simDim >::init();
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
         /* add CUDA streams to the StreamController for concurrent execution */
         Environment<>::get().StreamController().addStreams(6);
 #endif
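The Environment.hpp hunks that follow, like the one in patch 1, funnel every direct cuda*/hip* runtime call through cupla's ALPAKA_API_PREFIX macro. A sketch of the underlying token pasting, with stand-in macro names (the real definition ships with cupla/alpaka):

    #define MY_JOIN_IMPL(a, b) a##b
    #define MY_JOIN(a, b) MY_JOIN_IMPL(a, b)

    // Stand-in for ALPAKA_API_PREFIX: pick the runtime prefix at compile time.
    #if defined(__HIP__)
    #   define MY_API_PREFIX(name) MY_JOIN(hip, name)
    #else
    #   define MY_API_PREFIX(name) MY_JOIN(cuda, name)
    #endif

    // MY_API_PREFIX(GetDeviceProperties) expands to hipGetDeviceProperties
    // or cudaGetDeviceProperties, so one call site serves both runtimes:
    //     MY_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId);
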
diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp
index 7fe06a5554..7d222d65d2 100644
--- a/include/pmacc/Environment.hpp
+++ b/include/pmacc/Environment.hpp
@@ -468,7 +468,7 @@ namespace detail
     {
         int num_gpus = 0; //number of gpus
         cuplaGetDeviceCount(&num_gpus);
-#if (PMACC_CUDA_ENABLED == 1)
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
         //##ERROR handling
         if (num_gpus < 1) //check if cupla device is found
         {
@@ -506,7 +506,7 @@ namespace detail
          * The index used to select a device is based on the local MPI rank so
          * that each rank tries a different device.
          */
-        if (devProp.computeMode == cudaComputeModeDefault)
+        if (devProp.computeMode == ALPAKA_API_PREFIX(ComputeModeDefault))
        {
            maxTries = 1;
            log("Device %1% is running in default mode.") % tryDeviceId;
@@ -532,18 +532,17 @@ namespace detail
 
             if (rc == cuplaSuccess)
             {
-#if (PMACC_CUDA_ENABLED == 1)
-                cudaDeviceProp dprop;
-                CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&dprop, tryDeviceId));
-                log ("Set device to %1%: %2%") % tryDeviceId % dprop.name;
-                if(cudaErrorSetOnActiveProcess == cudaSetDeviceFlags(cudaDeviceScheduleSpin))
+#if(BOOST_LANG_CUDA || BOOST_LANG_HIP)
+                CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId));
+                log ("Set device to %1%: %2%") % tryDeviceId % devProp.name;
+                if(ALPAKA_API_PREFIX(ErrorSetOnActiveProcess) == ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin)))
                 {
                     cuplaGetLastError(); //reset all errors
                     /* - because cuplaStreamCreate was already called, cuplaSetDeviceFlags crashed
                      * - to set the flags, reset the device and set the flags again
                      */
                     CUDA_CHECK(cuplaDeviceReset());
-                    CUDA_CHECK((cuplaError_t)cudaSetDeviceFlags(cudaDeviceScheduleSpin));
+                    CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin)));
                 }
 #endif
                 CUDA_CHECK(cuplaGetLastError());
diff --git a/include/pmacc/PMaccConfig.cmake b/include/pmacc/PMaccConfig.cmake
index 2878eacee3..2c0a14e481 100644
--- a/include/pmacc/PMaccConfig.cmake
+++ b/include/pmacc/PMaccConfig.cmake
@@ -368,7 +368,7 @@ endif()
 # Find mallocMC
 ################################################################################
 
-if(ALPAKA_ACC_GPU_CUDA_ENABLE)
+if(ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE)
     set(mallocMC_ALPAKA_PROVIDER "extern" CACHE STRING "Select which alpaka is used for mallocMC")
     find_package(mallocMC 2.5.0 QUIET)
diff --git a/include/pmacc/algorithms/math/doubleMath/bessel.tpp b/include/pmacc/algorithms/math/doubleMath/bessel.tpp
index c6f4af59be..b099bef2f8 100644
--- a/include/pmacc/algorithms/math/doubleMath/bessel.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/bessel.tpp
@@ -39,7 +39,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::cyl_bessel_i0( x );
 #else
                 return boost::math::cyl_bessel_i(
@@ -57,7 +57,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::cyl_bessel_i1( x );
 #else
                 return boost::math::cyl_bessel_i(
@@ -75,7 +75,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::j0( x );
 #else
                 return boost::math::cyl_bessel_j(
@@ -93,7 +93,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::j1( x );
 #else
                 return boost::math::cyl_bessel_j(
@@ -117,7 +117,7 @@ namespace bessel
                 result const & x
             )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::jn(
                     n,
                     x
@@ -138,7 +138,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::y0( x );
 #else
                 return boost::math::cyl_neumann(
@@ -156,7 +156,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
            {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::y1( x );
 #else
                 return boost::math::cyl_neumann(
@@ -180,7 +180,7 @@ namespace bessel
                 result const & x
             )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::yn(
                     n,
                     x
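The math .tpp hunks above and below all swap `#if __CUDA_ARCH__`, which only the CUDA device pass defines, for `CUPLA_DEVICE_COMPILE`, which cupla sets to 1 during any device compilation pass, CUDA or HIP. A self-contained sketch of the dispatch, with the guard stubbed out so the snippet also builds as plain host code:

    #include <cmath>

    // Stand-in: the real macro is provided by cupla and is 1 on device passes.
    #ifndef CUPLA_DEVICE_COMPILE
    #   define CUPLA_DEVICE_COMPILE 0
    #endif

    inline double hostDeviceModf( double value, double* intpart )
    {
    #if( CUPLA_DEVICE_COMPILE == 1 ) // device pass: global-namespace intrinsic
        return ::modf( value, intpart );
    #else                            // host pass: C++ standard library
        return std::modf( value, intpart );
    #endif
    }
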
diff --git a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
index 2f32967fd3..ec3d7c7a46 100644
--- a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
@@ -39,7 +39,7 @@ struct Float2int_ru
 
     HDINLINE result operator( )(double value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__double2int_ru( value );
 #else
         return static_cast< int >(ceil(value));
@@ -54,7 +54,7 @@ struct Float2int_rd
 
     HDINLINE result operator( )(double value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__double2int_rd( value );
 #else
         return static_cast< int >(floor(value));
@@ -69,7 +69,7 @@ struct Float2int_rn
 
     HDINLINE result operator( )(double value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__double2int_rn( value );
 #else
         if(value < 0.0)
diff --git a/include/pmacc/algorithms/math/doubleMath/modf.tpp b/include/pmacc/algorithms/math/doubleMath/modf.tpp
index 92ec4741da..b1532568c2 100644
--- a/include/pmacc/algorithms/math/doubleMath/modf.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/modf.tpp
@@ -36,7 +36,7 @@ struct Modf
 
     HDINLINE double operator()(double value, double* intpart)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::modf(value, intpart);
 #else
         return std::modf(value, intpart);
diff --git a/include/pmacc/algorithms/math/floatMath/bessel.tpp b/include/pmacc/algorithms/math/floatMath/bessel.tpp
index e627ee012e..15554587d6 100644
--- a/include/pmacc/algorithms/math/floatMath/bessel.tpp
+++ b/include/pmacc/algorithms/math/floatMath/bessel.tpp
@@ -39,7 +39,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::cyl_bessel_i0f( x );
 #else
                 return boost::math::cyl_bessel_i(
@@ -57,7 +57,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::cyl_bessel_i1f( x );
 #else
                 return boost::math::cyl_bessel_i(
@@ -75,7 +75,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::j0f( x );
 #else
                 return boost::math::cyl_bessel_j(
@@ -93,7 +93,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::j1f( x );
 #else
                 return boost::math::cyl_bessel_j(
@@ -117,7 +117,7 @@ namespace bessel
                 result const & x
             )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::jnf(
                     n,
                     x
@@ -138,7 +138,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::y0f( x );
 #else
                 return boost::math::cyl_neumann(
@@ -156,7 +156,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::y1f( x );
 #else
                 return boost::math::cyl_neumann(
@@ -180,7 +180,7 @@ namespace bessel
                 result const & x
             )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::ynf(
                     n,
                     x
diff --git a/include/pmacc/algorithms/math/floatMath/exp.tpp b/include/pmacc/algorithms/math/floatMath/exp.tpp
index 772dcf87a9..97ae7e0d13 100644
--- a/include/pmacc/algorithms/math/floatMath/exp.tpp
+++ b/include/pmacc/algorithms/math/floatMath/exp.tpp
@@ -38,7 +38,7 @@ namespace math
 
     HDINLINE float operator( )(const float& value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::log10f( value );
 #else
         return ::log10( value );
diff --git a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
index 206b0118f1..681f33e21a 100644
--- a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
+++ b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
@@ -39,7 +39,7 @@ struct Float2int_ru
 
     HDINLINE result operator( )(float value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__float2int_ru( value );
 #else
         return static_cast< int >(ceil(value));
@@ -54,7 +54,7 @@ struct Float2int_rd
 
     HDINLINE result operator( )(float value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__float2int_rd( value );
 #else
         return static_cast< int >(floor(value));
@@ -69,7 +69,7 @@ struct Float2int_rn
 
     HDINLINE result operator( )(float value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__float2int_rn( value );
 #else
         if(value < 0.0f)
diff --git a/include/pmacc/algorithms/math/floatMath/modf.tpp b/include/pmacc/algorithms/math/floatMath/modf.tpp
index d2678d179e..59efffd3ae 100644
--- a/include/pmacc/algorithms/math/floatMath/modf.tpp
+++ b/include/pmacc/algorithms/math/floatMath/modf.tpp
@@ -36,7 +36,7 @@ struct Modf
 
     HDINLINE float operator()(float value, float* intpart)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::modff(value, intpart);
 #else
         return std::modf(value, intpart);
diff --git a/include/pmacc/math/ConstVector.hpp b/include/pmacc/math/ConstVector.hpp
index b1fbad1b0a..b2b4df31de 100644
--- a/include/pmacc/math/ConstVector.hpp
+++ b/include/pmacc/math/ConstVector.hpp
@@ -26,13 +26,13 @@
 #include "pmacc/types.hpp"
 
 /* select namespace depending on __CUDA_ARCH__ compiler flag*/
-#ifdef __CUDA_ARCH__ //we are on gpu
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
 #   define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_device,id)
 #else
 #   define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_host,id)
 #endif
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || BOOST_COMP_HIP
 #   define PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id,Name,Type,...)           \
     namespace PMACC_JOIN(pmacc_static_const_vector_device,id)                 \
     {                                                                         \
@@ -87,7 +87,7 @@ namespace PMACC_JOIN(pmacc_static_const_storage,id)
 } /* namespace pmacc_static_const_storage + id */                             \
 using namespace PMACC_JOIN(pmacc_static_const_storage,id)
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || BOOST_COMP_HIP
 #   define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name,id)               \
     namespace PMACC_JOIN(pmacc_static_const_vector_device,id)                 \
     {                                                                         \
diff --git a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
index 031aae887b..aabb323990 100644
--- a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
+++ b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
@@ -22,7 +22,7 @@
 
 #pragma once
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
 #   include <mallocMC/mallocMC.hpp>
 #endif
 #include "pmacc/particles/frame_types.hpp"
@@ -97,7 +97,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
         const int maxTries = 13; //magic number is not performance critical
         for ( int numTries = 0; numTries < maxTries; ++numTries )
         {
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
             tmp = (FrameType*) m_deviceHeapHandle.malloc( acc, sizeof (FrameType) );
 #else
             tmp = new FrameType;
@@ -107,7 +107,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
                 /* disable all particles since we can not assume that newly allocated memory contains zeros */
                 for ( int i = 0; i < (int) math::CT::volume< typename FrameType::SuperCellSize >::type::value; ++i )
                     ( *tmp )[i][multiMask_] = 0;
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
                 /* takes care that changed values are visible to all threads inside this block*/
                 __threadfence_block( );
 #endif
@@ -115,10 +115,12 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
             }
             else
             {
+#ifndef BOOST_COMP_HIP
                 printf( "%s: mallocMC out of memory (try %i of %i)\n",
                         (numTries + 1) == maxTries ? "ERROR" : "WARNING",
                         numTries + 1,
                         maxTries );
+#endif
             }
         }
 
@@ -133,7 +135,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
     template< typename T_Acc >
     DINLINE void removeFrame( const T_Acc & acc, FramePtr& frame )
     {
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
         m_deviceHeapHandle.free( acc, (void*) frame.ptr );
 #else
         delete(frame.ptr);
@@ -144,14 +146,14 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
 
     HDINLINE FramePtr mapPtr( const FramePtr& devPtr ) const
     {
-#ifndef __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1)
+        return devPtr;
+#else
         int64_t useOffset = hostMemoryOffset * static_cast< int64_t > (devPtr.ptr != 0);
         return FramePtr(
             reinterpret_cast< FrameType* >(
                 reinterpret_cast< char* >(devPtr.ptr) - useOffset
             )
         );
-#else
-        return devPtr;
 #endif
     }
 
@@ -218,7 +220,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
             frame->previousFrame = FramePtr( );
             frame->nextFrame = FramePtr( *firstFrameNativPtr );
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
             /* - takes care that `next[index]` is visible to all threads on the gpu
              * - this is needed because later on in this method we change `previous`
              *   of another frame, and this must be done in order!
@@ -267,7 +269,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
             frame->nextFrame = FramePtr( );
             frame->previousFrame = FramePtr( *lastFrameNativPtr );
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
             /* - takes care that `next[index]` is visible to all threads on the gpu
              * - this is needed because later on in this method we change `next`
              *   of another frame, and this must be done in order!
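The ParticlesBox change above compiles the out-of-memory printf only when not building for HIP; at the time of this patch, device-side printf could not be relied on under HIP, so the diagnostic is dropped rather than failing the build. A minimal device-code sketch of the same guard (assumes compilation with nvcc or hipcc and Boost.Predef's BOOST_COMP_HIP being visible):

    #include <cstdio>

    __device__ void reportAllocRetry( int numTries, int maxTries )
    {
    #ifndef BOOST_COMP_HIP // device printf is skipped for HIP builds
        printf( "%s: mallocMC out of memory (try %i of %i)\n",
                ( numTries + 1 ) == maxTries ? "ERROR" : "WARNING",
                numTries + 1,
                maxTries );
    #endif
    }
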
diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp
index 1344b24310..5822897d67 100644
--- a/include/pmacc/random/methods/XorMin.hpp
+++ b/include/pmacc/random/methods/XorMin.hpp
@@ -40,14 +40,14 @@ namespace random
 namespace methods
 {
 
-#if( BOOST_LANG_CUDA || BOOST_LANG_HIP )
+#if( ALPAKA_ACC_GPU_CUDA_ENABLED || ALPAKA_ACC_GPU_HIP_ENABLED )
     //! Uses the CUDA XORWOW RNG but does not store state members required for normal distribution
     template< typename T_Acc = cupla::Acc>
     class XorMin
     {
-#if (BOOST_LANG_HIP)
+#if( BOOST_LANG_HIP )
         using NativeStateType = hiprandStateXORWOW_t;
-#elif (BOOST_LANG_CUDA)
+#elif( BOOST_LANG_CUDA )
         using NativeStateType = curandStateXORWOW_t;
 #endif
 
@@ -69,13 +69,13 @@ namespace methods
 
             DINLINE StateType( NativeStateType const & other ): d( other.d )
             {
-#if (BOOST_LANG_HIP)
+#if( BOOST_LANG_HIP )
                 auto const* nativeStateArray = other.x;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.x ),
                     Unexpected_sizes
                 );
-#elif (BOOST_LANG_CUDA)
+#elif( BOOST_LANG_CUDA )
                 auto const* nativeStateArray = other.v;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.v ),
@@ -97,10 +97,10 @@ namespace methods
         {
             NativeStateType tmpState;
 
-#if (BOOST_LANG_HIP)
-# define define PMACC_RNG_INIT_FN hiprand_init
-#elif (BOOST_LANG_CUDA)
-# define define PMACC_RNG_INIT_FN curand_init
+#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
+# define PMACC_RNG_INIT_FN hiprand_init
+#elif( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 )
+# define PMACC_RNG_INIT_FN curand_init
 #endif
 
             PMACC_RNG_INIT_FN(
diff --git a/include/pmacc/static_assert.hpp b/include/pmacc/static_assert.hpp
index 7f279572e0..1f730bbf36 100644
--- a/include/pmacc/static_assert.hpp
+++ b/include/pmacc/static_assert.hpp
@@ -45,7 +45,7 @@ namespace pmacc
  * @param pmacc_unique_id pre compiler unique id
  * @param pmacc_typeInfo a type that is shown in error message
  */
-#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA
+#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
 /* device compile with clang: boost static assert can not be used
  * error is: calling a `__host__` function from `__device__`
  * Therefore C++11 `static_assert` is used
diff --git a/include/pmacc/types.hpp b/include/pmacc/types.hpp
index 826219474d..7230641f36 100644
--- a/include/pmacc/types.hpp
+++ b/include/pmacc/types.hpp
@@ -33,7 +33,7 @@
 #   define PMACC_CUDA_ENABLED ALPAKA_ACC_GPU_CUDA_ENABLED
 #endif
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
 /* include mallocMC before cupla renaming is activated, else we need the variable acc
  * to call atomic cuda functions
  */
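After patch 2, the init() path reduces to a plain XORWOW seeding call on either backend; the two device APIs share the (seed, subsequence, offset, state) argument order. A sketch of the HIP side, assuming the hiprand device API (curand_init is the drop-in CUDA equivalent):

    #include <hiprand_kernel.h>

    __device__ void initRngState(
        hiprandStateXORWOW_t & state,
        unsigned long long seed,
        unsigned long long subsequence )
    {
        // arguments: seed, subsequence, offset within the subsequence, state
        hiprand_init( seed, subsequence, 0, &state );
    }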