From 8aefa63503dbfff92981f56da575c6a51bb4f459 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?=
Date: Fri, 18 Sep 2020 09:14:46 +0200
Subject: [PATCH 1/2] HIP: RNG XorMin

Add HIP support for the random number generator XorMin.
---
 include/pmacc/Environment.hpp           | 10 ++++-
 include/pmacc/random/methods/XorMin.hpp | 51 +++++++++++++++++++------
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp
index b35facbab4..7fe06a5554 100644
--- a/include/pmacc/Environment.hpp
+++ b/include/pmacc/Environment.hpp
@@ -490,9 +490,15 @@ namespace detail
             const int tryDeviceId = (deviceOffset + deviceNumber) % num_gpus;
             log("Trying to allocate device %1%.") % tryDeviceId;
-#if (PMACC_CUDA_ENABLED == 1)
+
+#if(BOOST_LANG_CUDA || BOOST_LANG_HIP)
+# if(BOOST_LANG_CUDA)
             cudaDeviceProp devProp;
-            CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&devProp, tryDeviceId));
+# elif(BOOST_LANG_HIP)
+            hipDeviceProp_t devProp;
+# endif
+
+            CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId));
 
             /* If the cuda gpu compute mode is 'default'
              * (https://docs.nvidia.com/cuda/cuda-c-programming-guide/#compute-modes)
diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp
index 10850befcf..1344b24310 100644
--- a/include/pmacc/random/methods/XorMin.hpp
+++ b/include/pmacc/random/methods/XorMin.hpp
@@ -24,10 +24,12 @@
 #include "pmacc/types.hpp"
 #include "pmacc/static_assert.hpp"
 
-#if( PMACC_CUDA_ENABLED != 1 )
-#   include "pmacc/random/methods/AlpakaRand.hpp"
-#else
+#if( BOOST_LANG_CUDA )
 #   include <curand_kernel.h>
+#elif( BOOST_LANG_HIP )
+#   include <hiprand_kernel.h>
+#else
+#   include "pmacc/random/methods/AlpakaRand.hpp"
 #endif
 
 
@@ -38,15 +40,17 @@ namespace random
 namespace methods
 {
 
-#if( PMACC_CUDA_ENABLED != 1 )
-    //! fallback to alpaka RNG if a cpu accelerator is used
-    template< typename T_Acc = cupla::Acc>
-    using XorMin = AlpakaRand< T_Acc >;
-#else
+#if( BOOST_LANG_CUDA || BOOST_LANG_HIP )
     //! Uses the CUDA XORWOW RNG but does not store state members required for normal distribution
     template< typename T_Acc = cupla::Acc>
     class XorMin
     {
+#if (BOOST_LANG_HIP)
+        using NativeStateType = hiprandStateXORWOW_t;
+#elif (BOOST_LANG_CUDA)
+        using NativeStateType = curandStateXORWOW_t;
+#endif
+
     public:
         class StateType
         {
@@ -63,14 +67,23 @@ namespace methods
             HDINLINE StateType( )
             { }
 
-            DINLINE StateType( curandStateXORWOW_t const & other ): d( other.d )
+            DINLINE StateType( NativeStateType const & other ): d( other.d )
             {
+#if (BOOST_LANG_HIP)
+                auto const* nativeStateArray = other.x;
+                PMACC_STATIC_ASSERT_MSG(
+                    sizeof( v ) == sizeof( other.x ),
+                    Unexpected_sizes
+                );
+#elif (BOOST_LANG_CUDA)
+                auto const* nativeStateArray = other.v;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.v ),
                     Unexpected_sizes
                 );
+#endif
                 for( unsigned i = 0; i < sizeof( v ) / sizeof( v[ 0 ] ); i++ )
-                    v[ i ] = other.v[ i ];
+                    v[ i ] = nativeStateArray[ i ];
             }
         };
 
@@ -82,13 +95,23 @@ namespace methods
             uint32_t subsequence = 0
         ) const
         {
-            curandStateXORWOW_t tmpState;
-            curand_init(
+            NativeStateType tmpState;
+
+#if (BOOST_LANG_HIP)
+# define define PMACC_RNG_INIT_FN hiprand_init
+#elif (BOOST_LANG_CUDA)
+# define define PMACC_RNG_INIT_FN curand_init
+#endif
+
+            PMACC_RNG_INIT_FN(
                 seed,
                 subsequence,
                 0,
                 &tmpState
             );
+
+#undef PMACC_RNG_INIT_FN
+
             state = tmpState;
         }
 
@@ -132,6 +155,10 @@ namespace methods
             return "XorMin";
         }
     };
+#else
+    //! fallback to alpaka RNG if a cpu accelerator is used
+    template< typename T_Acc = cupla::Acc>
+    using XorMin = AlpakaRand< T_Acc >;
 #endif
 } // namespace methods
 } // namespace random
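The StateType copy constructor above keeps only the XORWOW core (the `d` counter and the five 32-bit state words) and drops the Box-Muller members that the full curand/hiprand states carry; the static assert guards against the native array changing size. A minimal standalone sketch of that reduction, with a hypothetical `copyXorwowCore` helper (the member name `v` matches curand; hiprand names the same array `x`):

    #include <cstdint>

    // Reduced XORWOW state: the counter plus five state words, without
    // the Box-Muller members of the full curand/hiprand state.
    struct XorwowCore
    {
        std::uint32_t d;
        std::uint32_t v[5];
    };

    // Hypothetical helper: extract the core from a native state type whose
    // state array is named `v` (for hiprand the member would be `x`).
    template< typename T_NativeState >
    XorwowCore copyXorwowCore( T_NativeState const & native )
    {
        static_assert(
            sizeof( XorwowCore::v ) == sizeof( native.v ),
            "unexpected native state size" );
        XorwowCore core;
        core.d = native.d;
        for( unsigned i = 0u; i < 5u; ++i )
            core.v[ i ] = native.v[ i ];
        return core;
    }
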
From a27b90e48ffc81092271a680aa207879ead07936 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?=
Date: Fri, 18 Sep 2020 09:16:35 +0200
Subject: [PATCH 2/2] HIP compatibility

Increase HIP compatibility of PMacc and PIConGPU.
---
 include/picongpu/_defaultParam.loader         |  2 +-
 .../fields/currentDeposition/Strategy.def     | 14 +++++++++++++
 include/picongpu/particles/Particles.hpp      |  2 +-
 .../picongpu/particles/ParticlesFunctors.hpp  |  2 +-
 .../simulation/control/MySimulation.hpp       | 12 +++++++----
 include/pmacc/Environment.hpp                 | 15 +++++++-------
 include/pmacc/PMaccConfig.cmake               |  2 +-
 .../algorithms/math/doubleMath/bessel.tpp     | 16 +++++++--------
 .../math/doubleMath/floatingPoint.tpp         |  6 +++---
 .../pmacc/algorithms/math/doubleMath/modf.tpp |  2 +-
 .../algorithms/math/floatMath/bessel.tpp      | 16 +++++++--------
 .../pmacc/algorithms/math/floatMath/exp.tpp   |  2 +-
 .../math/floatMath/floatingPoint.tpp          |  6 +++---
 .../pmacc/algorithms/math/floatMath/modf.tpp  |  2 +-
 include/pmacc/math/ConstVector.hpp            |  6 +++---
 .../particles/memory/boxes/ParticlesBox.hpp   | 20 ++++++++++---------
 include/pmacc/random/methods/XorMin.hpp       | 18 ++++++++--------
 include/pmacc/static_assert.hpp               |  2 +-
 include/pmacc/types.hpp                       |  2 +-
 19 files changed, 83 insertions(+), 64 deletions(-)

diff --git a/include/picongpu/_defaultParam.loader b/include/picongpu/_defaultParam.loader
index c9d3b14f75..04c53a8c58 100644
--- a/include/picongpu/_defaultParam.loader
+++ b/include/picongpu/_defaultParam.loader
@@ -26,7 +26,7 @@
 #pragma once
 
 #include "picongpu/param/dimension.param"
-#if( PMACC_CUDA_ENABLED == 1 )
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
 #   include "picongpu/param/mallocMC.param"
 #endif
 #include "picongpu/param/memory.param"
diff --git a/include/picongpu/fields/currentDeposition/Strategy.def b/include/picongpu/fields/currentDeposition/Strategy.def
index 6950665ccd..f4963bea30 100644
--- a/include/picongpu/fields/currentDeposition/Strategy.def
+++ b/include/picongpu/fields/currentDeposition/Strategy.def
@@ -143,6 +143,20 @@ namespace traits
         alpaka::acc::AccGpuCudaRt< T_Args... >
     >
     {
+        // GPU utilization is higher compared to `StridedCachedSupercells`
         using type = strategy::CachedSupercells;
     };
 #endif
+
+#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
+    template<
+        typename ... T_Args
+    >
+    struct GetDefaultStrategy<
+        alpaka::acc::AccGpuHipRt< T_Args... >
+    >
+    {
+        // GPU utilization is higher compared to `StridedCachedSupercells`
+        using type = strategy::CachedSupercells;
+    };
+#endif
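GetDefaultStrategy above is an ordinary trait: a primary template supplies the fallback current-deposition strategy and one partial specialization per accelerator overrides it. A condensed sketch of the pattern with stand-in tag and strategy types (not the PIConGPU definitions):

    struct StridedCachedSupercells {}; // generic fallback strategy
    struct CachedSupercells {};        // strategy preferred on GPUs

    // primary template: used whenever no specialization matches
    template< typename T_Acc >
    struct GetDefaultStrategy
    {
        using type = StridedCachedSupercells;
    };

    // stand-in for an accelerator tag such as alpaka::acc::AccGpuHipRt
    template< typename... T_Args >
    struct AccGpuRt {};

    // specialization mirroring the one added in the diff above
    template< typename... T_Args >
    struct GetDefaultStrategy< AccGpuRt< T_Args... > >
    {
        // GPU utilization is higher compared to `StridedCachedSupercells`
        using type = CachedSupercells;
    };
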
diff --git a/include/picongpu/particles/Particles.hpp b/include/picongpu/particles/Particles.hpp
index 08aaccb71d..eda48ded67 100644
--- a/include/picongpu/particles/Particles.hpp
+++ b/include/picongpu/particles/Particles.hpp
@@ -49,7 +49,7 @@ namespace picongpu
 {
 using namespace pmacc;
 
-#if( PMACC_CUDA_ENABLED != 1 )
+#if(!BOOST_LANG_CUDA && !BOOST_COMP_HIP)
 /* dummy because we are not using mallocMC with cupla
  * DeviceHeap is defined in `mallocMC.param`
  */
diff --git a/include/picongpu/particles/ParticlesFunctors.hpp b/include/picongpu/particles/ParticlesFunctors.hpp
index 103aa436f9..4630bfcf96 100644
--- a/include/picongpu/particles/ParticlesFunctors.hpp
+++ b/include/picongpu/particles/ParticlesFunctors.hpp
@@ -125,7 +125,7 @@ struct LogMemoryStatisticsForSpecies
         const std::shared_ptr< DeviceHeap >& deviceHeap
     ) const
     {
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
        log("mallocMC: free slots for species %3%: %1% a %2%") %
            deviceHeap->getAvailableSlots(
                cupla::manager::Device< cupla::AccDev >::get().current(),
diff --git a/include/picongpu/simulation/control/MySimulation.hpp b/include/picongpu/simulation/control/MySimulation.hpp
index cc12a59768..57395bab49 100644
--- a/include/picongpu/simulation/control/MySimulation.hpp
+++ b/include/picongpu/simulation/control/MySimulation.hpp
@@ -373,7 +373,9 @@ class MySimulation : public SimulationHelper
 
             this->bremsstrahlungPhotonAngle.init();
         }
+#endif
 
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
         auto nativeCudaStream = cupla::manager::Stream<
             cupla::AccDev,
             cupla::AccStream
@@ -425,7 +427,7 @@ class MySimulation : public SimulationHelper
             throw std::runtime_error(msg.str());
         }
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
         size_t heapSize = freeGpuMem - reservedGpuMemorySize;
 
         if( Environment<>::get().MemoryInfo().isSharedMemoryPool() )
@@ -443,10 +445,12 @@ class MySimulation : public SimulationHelper
             heapSize
         );
         cuplaStreamSynchronize( 0 );
-
-        auto mallocMCBuffer = std::make_unique< MallocMCBuffer >( deviceHeap );
+# if( PMACC_CUDA_ENABLED == 1 )
+        auto mallocMCBuffer = std::make_unique< MallocMCBuffer< DeviceHeap > >( deviceHeap );
         dc.consume( std::move( mallocMCBuffer ) );
+# endif
 #endif
+
         meta::ForEach< VectorAllSpecies, particles::LogMemoryStatisticsForSpecies > logMemoryStatisticsForSpecies;
         logMemoryStatisticsForSpecies( deviceHeap );
 
@@ -455,7 +459,7 @@ class MySimulation : public SimulationHelper
 
         IdProvider< simDim >::init();
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
         /* add CUDA streams to the StreamController for concurrent execution */
         Environment<>::get().StreamController().addStreams(6);
 #endif
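The Environment.hpp hunks that follow, like the one in patch 1, funnel every direct cuda*/hip* runtime call through cupla's ALPAKA_API_PREFIX macro. A sketch of the underlying token pasting, with stand-in macro names (the real definition ships with cupla/alpaka):

    #define MY_JOIN_IMPL(a, b) a##b
    #define MY_JOIN(a, b) MY_JOIN_IMPL(a, b)

    // Stand-in for ALPAKA_API_PREFIX: pick the runtime prefix at compile time.
    #if defined(__HIP__)
    #   define MY_API_PREFIX(name) MY_JOIN(hip, name)
    #else
    #   define MY_API_PREFIX(name) MY_JOIN(cuda, name)
    #endif

    // MY_API_PREFIX(GetDeviceProperties) expands to hipGetDeviceProperties
    // or cudaGetDeviceProperties, so one call site serves both runtimes:
    //     MY_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId);
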
diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp
index 7fe06a5554..7d222d65d2 100644
--- a/include/pmacc/Environment.hpp
+++ b/include/pmacc/Environment.hpp
@@ -468,7 +468,7 @@ namespace detail
     {
         int num_gpus = 0; //number of gpus
         cuplaGetDeviceCount(&num_gpus);
-#if (PMACC_CUDA_ENABLED == 1)
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
         //##ERROR handling
         if (num_gpus < 1) //check if cupla device is found
         {
@@ -506,7 +506,7 @@ namespace detail
          * The index used to select a device is based on the local MPI rank so
          * that each rank tries a different device.
          */
-        if (devProp.computeMode == cudaComputeModeDefault)
+        if (devProp.computeMode == ALPAKA_API_PREFIX(ComputeModeDefault))
        {
            maxTries = 1;
            log("Device %1% is running in default mode.") % tryDeviceId;
@@ -532,18 +532,17 @@ namespace detail
 
             if (rc == cuplaSuccess)
             {
-#if (PMACC_CUDA_ENABLED == 1)
-                cudaDeviceProp dprop;
-                CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&dprop, tryDeviceId));
-                log ("Set device to %1%: %2%") % tryDeviceId % dprop.name;
-                if(cudaErrorSetOnActiveProcess == cudaSetDeviceFlags(cudaDeviceScheduleSpin))
+#if(BOOST_LANG_CUDA || BOOST_LANG_HIP)
+                CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId));
+                log ("Set device to %1%: %2%") % tryDeviceId % devProp.name;
+                if(ALPAKA_API_PREFIX(ErrorSetOnActiveProcess) == ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin)))
                 {
                     cuplaGetLastError(); //reset all errors
                     /* - because cuplaStreamCreate was already called, cuplaSetDeviceFlags crashed
                      * - to set the flags, reset the device and set the flags again
                      */
                     CUDA_CHECK(cuplaDeviceReset());
-                    CUDA_CHECK((cuplaError_t)cudaSetDeviceFlags(cudaDeviceScheduleSpin));
+                    CUDA_CHECK((cuplaError_t)ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin)));
                 }
 #endif
                 CUDA_CHECK(cuplaGetLastError());
diff --git a/include/pmacc/PMaccConfig.cmake b/include/pmacc/PMaccConfig.cmake
index 2878eacee3..2c0a14e481 100644
--- a/include/pmacc/PMaccConfig.cmake
+++ b/include/pmacc/PMaccConfig.cmake
@@ -368,7 +368,7 @@ endif()
 # Find mallocMC
 ################################################################################
 
-if(ALPAKA_ACC_GPU_CUDA_ENABLE)
+if(ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE)
     set(mallocMC_ALPAKA_PROVIDER "extern" CACHE STRING "Select which alpaka is used for mallocMC")
     find_package(mallocMC 2.5.0 QUIET)
diff --git a/include/pmacc/algorithms/math/doubleMath/bessel.tpp b/include/pmacc/algorithms/math/doubleMath/bessel.tpp
index c6f4af59be..b099bef2f8 100644
--- a/include/pmacc/algorithms/math/doubleMath/bessel.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/bessel.tpp
@@ -39,7 +39,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::cyl_bessel_i0( x );
 #else
                 return boost::math::cyl_bessel_i(
@@ -57,7 +57,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::cyl_bessel_i1( x );
 #else
                 return boost::math::cyl_bessel_i(
@@ -75,7 +75,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::j0( x );
 #else
                 return boost::math::cyl_bessel_j(
@@ -93,7 +93,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::j1( x );
 #else
                 return boost::math::cyl_bessel_j(
@@ -117,7 +117,7 @@ namespace bessel
                 result const & x
             )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::jn(
                     n,
                     x
@@ -138,7 +138,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::y0( x );
 #else
                 return boost::math::cyl_neumann(
@@ -156,7 +156,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
            {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::y1( x );
 #else
                 return boost::math::cyl_neumann(
@@ -180,7 +180,7 @@ namespace bessel
                 result const & x
             )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::yn(
                     n,
                     x
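The math .tpp hunks above and below all swap `#if __CUDA_ARCH__`, which only the CUDA device pass defines, for `CUPLA_DEVICE_COMPILE`, which cupla sets to 1 during any device compilation pass, CUDA or HIP. A self-contained sketch of the dispatch, with the guard stubbed out so the snippet also builds as plain host code:

    #include <cmath>

    // Stand-in: the real macro is provided by cupla and is 1 on device passes.
    #ifndef CUPLA_DEVICE_COMPILE
    #   define CUPLA_DEVICE_COMPILE 0
    #endif

    inline double hostDeviceModf( double value, double* intpart )
    {
    #if( CUPLA_DEVICE_COMPILE == 1 ) // device pass: global-namespace intrinsic
        return ::modf( value, intpart );
    #else                            // host pass: C++ standard library
        return std::modf( value, intpart );
    #endif
    }
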
diff --git a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
index 2f32967fd3..ec3d7c7a46 100644
--- a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
@@ -39,7 +39,7 @@ struct Float2int_ru
 
     HDINLINE result operator( )(double value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__double2int_ru( value );
 #else
         return static_cast< int >(ceil(value));
@@ -54,7 +54,7 @@ struct Float2int_rd
 
     HDINLINE result operator( )(double value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__double2int_rd( value );
 #else
         return static_cast< int >(floor(value));
@@ -69,7 +69,7 @@ struct Float2int_rn
 
     HDINLINE result operator( )(double value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__double2int_rn( value );
 #else
         if(value < 0.0)
diff --git a/include/pmacc/algorithms/math/doubleMath/modf.tpp b/include/pmacc/algorithms/math/doubleMath/modf.tpp
index 92ec4741da..b1532568c2 100644
--- a/include/pmacc/algorithms/math/doubleMath/modf.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/modf.tpp
@@ -36,7 +36,7 @@ struct Modf
 
     HDINLINE double operator()(double value, double* intpart)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::modf(value, intpart);
 #else
         return std::modf(value, intpart);
diff --git a/include/pmacc/algorithms/math/floatMath/bessel.tpp b/include/pmacc/algorithms/math/floatMath/bessel.tpp
index e627ee012e..15554587d6 100644
--- a/include/pmacc/algorithms/math/floatMath/bessel.tpp
+++ b/include/pmacc/algorithms/math/floatMath/bessel.tpp
@@ -39,7 +39,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::cyl_bessel_i0f( x );
 #else
                 return boost::math::cyl_bessel_i(
@@ -57,7 +57,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::cyl_bessel_i1f( x );
 #else
                 return boost::math::cyl_bessel_i(
@@ -75,7 +75,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::j0f( x );
 #else
                 return boost::math::cyl_bessel_j(
@@ -93,7 +93,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::j1f( x );
 #else
                 return boost::math::cyl_bessel_j(
@@ -117,7 +117,7 @@ namespace bessel
                 result const & x
             )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::jnf(
                     n,
                     x
@@ -138,7 +138,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::y0f( x );
 #else
                 return boost::math::cyl_neumann(
@@ -156,7 +156,7 @@ namespace bessel
 
             HDINLINE result operator( )( result const & x )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::y1f( x );
 #else
                 return boost::math::cyl_neumann(
@@ -180,7 +180,7 @@ namespace bessel
                 result const & x
             )
             {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
                 return ::ynf(
                     n,
                     x
diff --git a/include/pmacc/algorithms/math/floatMath/exp.tpp b/include/pmacc/algorithms/math/floatMath/exp.tpp
index 772dcf87a9..97ae7e0d13 100644
--- a/include/pmacc/algorithms/math/floatMath/exp.tpp
+++ b/include/pmacc/algorithms/math/floatMath/exp.tpp
@@ -38,7 +38,7 @@ namespace math
 
     HDINLINE float operator( )(const float& value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::log10f( value );
 #else
         return ::log10( value );
diff --git a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
index 206b0118f1..681f33e21a 100644
--- a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
+++ b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
@@ -39,7 +39,7 @@ struct Float2int_ru
 
     HDINLINE result operator( )(float value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__float2int_ru( value );
 #else
         return static_cast< int >(ceil(value));
@@ -54,7 +54,7 @@ struct Float2int_rd
 
     HDINLINE result operator( )(float value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__float2int_rd( value );
 #else
         return static_cast< int >(floor(value));
@@ -69,7 +69,7 @@ struct Float2int_rn
 
     HDINLINE result operator( )(float value)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::__float2int_rn( value );
 #else
         if(value < 0.0f)
diff --git a/include/pmacc/algorithms/math/floatMath/modf.tpp b/include/pmacc/algorithms/math/floatMath/modf.tpp
index d2678d179e..59efffd3ae 100644
--- a/include/pmacc/algorithms/math/floatMath/modf.tpp
+++ b/include/pmacc/algorithms/math/floatMath/modf.tpp
@@ -36,7 +36,7 @@ struct Modf
 
     HDINLINE float operator()(float value, float* intpart)
     {
-#if __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
         return ::modff(value, intpart);
 #else
         return std::modf(value, intpart);
diff --git a/include/pmacc/math/ConstVector.hpp b/include/pmacc/math/ConstVector.hpp
index b1fbad1b0a..b2b4df31de 100644
--- a/include/pmacc/math/ConstVector.hpp
+++ b/include/pmacc/math/ConstVector.hpp
@@ -26,13 +26,13 @@
 #include "pmacc/types.hpp"
 
 /* select namespace depending on __CUDA_ARCH__ compiler flag*/
-#ifdef __CUDA_ARCH__ //we are on gpu
+#if( CUPLA_DEVICE_COMPILE == 1) //we are on gpu
 #   define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_device,id)
 #else
 #   define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_host,id)
 #endif
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || BOOST_COMP_HIP
 #   define PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id,Name,Type,...)           \
     namespace PMACC_JOIN(pmacc_static_const_vector_device,id)                 \
     {                                                                         \
@@ -87,7 +87,7 @@ namespace PMACC_JOIN(pmacc_static_const_storage,id)
 } /* namespace pmacc_static_const_storage + id */                             \
 using namespace PMACC_JOIN(pmacc_static_const_storage,id)
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || BOOST_COMP_HIP
 #   define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name,id)               \
     namespace PMACC_JOIN(pmacc_static_const_vector_device,id)                 \
     {                                                                         \
diff --git a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
index 031aae887b..aabb323990 100644
--- a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
+++ b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
@@ -22,7 +22,7 @@
 
 #pragma once
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
 #   include <mallocMC/mallocMC.hpp>
 #endif
 #include "pmacc/particles/frame_types.hpp"
@@ -97,7 +97,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
         const int maxTries = 13; //magic number is not performance critical
         for ( int numTries = 0; numTries < maxTries; ++numTries )
         {
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
             tmp = (FrameType*) m_deviceHeapHandle.malloc( acc, sizeof (FrameType) );
 #else
             tmp = new FrameType;
@@ -107,7 +107,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
                 /* disable all particles since we can not assume that newly allocated memory contains zeros */
                 for ( int i = 0; i < (int) math::CT::volume< typename FrameType::SuperCellSize >::type::value; ++i )
                     ( *tmp )[i][multiMask_] = 0;
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
                 /* takes care that changed values are visible to all threads inside this block*/
                 __threadfence_block( );
 #endif
@@ -115,10 +115,12 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
             }
             else
             {
+#ifndef BOOST_COMP_HIP
                 printf( "%s: mallocMC out of memory (try %i of %i)\n",
                         (numTries + 1) == maxTries ? "ERROR" : "WARNING",
                         numTries + 1,
                         maxTries );
+#endif
             }
         }
 
@@ -133,7 +135,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
     template< typename T_Acc >
     DINLINE void removeFrame( const T_Acc & acc, FramePtr& frame )
     {
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
         m_deviceHeapHandle.free( acc, (void*) frame.ptr );
 #else
         delete(frame.ptr);
@@ -144,14 +146,14 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
 
     HDINLINE FramePtr mapPtr( const FramePtr& devPtr ) const
     {
-#ifndef __CUDA_ARCH__
+#if( CUPLA_DEVICE_COMPILE == 1)
+        return devPtr;
+#else
         int64_t useOffset = hostMemoryOffset * static_cast< int64_t > (devPtr.ptr != 0);
         return FramePtr(
             reinterpret_cast< FrameType* >(
                 reinterpret_cast< char* >(devPtr.ptr) - useOffset
             )
         );
-#else
-        return devPtr;
 #endif
     }
 
@@ -218,7 +220,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
             frame->previousFrame = FramePtr( );
             frame->nextFrame = FramePtr( *firstFrameNativPtr );
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
             /* - takes care that `next[index]` is visible to all threads on the gpu
              * - this is needed because later on in this method we change `previous`
              *   of another frame, and this must be done in order!
@@ -267,7 +269,7 @@ class ParticlesBox : protected DataBox< PitchedBox< FrameType, DIM > >
             frame->nextFrame = FramePtr( );
             frame->previousFrame = FramePtr( *lastFrameNativPtr );
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
             /* - takes care that `next[index]` is visible to all threads on the gpu
              * - this is needed because later on in this method we change `next`
              *   of another frame, and this must be done in order!
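The ParticlesBox change above compiles the out-of-memory printf only when not building for HIP; at the time of this patch, device-side printf could not be relied on under HIP, so the diagnostic is dropped rather than failing the build. A minimal device-code sketch of the same guard (assumes compilation with nvcc or hipcc and Boost.Predef's BOOST_COMP_HIP being visible):

    #include <cstdio>

    __device__ void reportAllocRetry( int numTries, int maxTries )
    {
    #ifndef BOOST_COMP_HIP // device printf is skipped for HIP builds
        printf( "%s: mallocMC out of memory (try %i of %i)\n",
                ( numTries + 1 ) == maxTries ? "ERROR" : "WARNING",
                numTries + 1,
                maxTries );
    #endif
    }
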
diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp
index 1344b24310..5822897d67 100644
--- a/include/pmacc/random/methods/XorMin.hpp
+++ b/include/pmacc/random/methods/XorMin.hpp
@@ -40,14 +40,14 @@ namespace random
 namespace methods
 {
 
-#if( BOOST_LANG_CUDA || BOOST_LANG_HIP )
+#if( ALPAKA_ACC_GPU_CUDA_ENABLED || ALPAKA_ACC_GPU_HIP_ENABLED )
     //! Uses the CUDA XORWOW RNG but does not store state members required for normal distribution
     template< typename T_Acc = cupla::Acc>
     class XorMin
     {
-#if (BOOST_LANG_HIP)
+#if( BOOST_LANG_HIP )
         using NativeStateType = hiprandStateXORWOW_t;
-#elif (BOOST_LANG_CUDA)
+#elif( BOOST_LANG_CUDA )
         using NativeStateType = curandStateXORWOW_t;
 #endif
 
@@ -69,13 +69,13 @@ namespace methods
 
             DINLINE StateType( NativeStateType const & other ): d( other.d )
             {
-#if (BOOST_LANG_HIP)
+#if( BOOST_LANG_HIP )
                 auto const* nativeStateArray = other.x;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.x ),
                     Unexpected_sizes
                 );
-#elif (BOOST_LANG_CUDA)
+#elif( BOOST_LANG_CUDA )
                 auto const* nativeStateArray = other.v;
                 PMACC_STATIC_ASSERT_MSG(
                     sizeof( v ) == sizeof( other.v ),
@@ -97,10 +97,10 @@ namespace methods
         {
             NativeStateType tmpState;
 
-#if (BOOST_LANG_HIP)
-# define define PMACC_RNG_INIT_FN hiprand_init
-#elif (BOOST_LANG_CUDA)
-# define define PMACC_RNG_INIT_FN curand_init
+#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
+# define PMACC_RNG_INIT_FN hiprand_init
+#elif( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 )
+# define PMACC_RNG_INIT_FN curand_init
 #endif
 
             PMACC_RNG_INIT_FN(
diff --git a/include/pmacc/static_assert.hpp b/include/pmacc/static_assert.hpp
index 7f279572e0..1f730bbf36 100644
--- a/include/pmacc/static_assert.hpp
+++ b/include/pmacc/static_assert.hpp
@@ -45,7 +45,7 @@ namespace pmacc
  * @param pmacc_unique_id pre compiler unique id
  * @param pmacc_typeInfo a type that is shown in error message
  */
-#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA
+#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
 /* device compile with clang: boost static assert can not be used
  * error is: calling a `__host__` function from `__device__`
  * Therefore C++11 `static_assert` is used
diff --git a/include/pmacc/types.hpp b/include/pmacc/types.hpp
index 826219474d..7230641f36 100644
--- a/include/pmacc/types.hpp
+++ b/include/pmacc/types.hpp
@@ -33,7 +33,7 @@
 #   define PMACC_CUDA_ENABLED ALPAKA_ACC_GPU_CUDA_ENABLED
 #endif
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if( BOOST_LANG_CUDA || BOOST_COMP_HIP)
 /* include mallocMC before cupla renaming is activated, else we need the variable acc
  * to call atomic cuda functions
  */
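After patch 2, the init() path reduces to a plain XORWOW seeding call on either backend; the two device APIs share the (seed, subsequence, offset, state) argument order. A sketch of the HIP side, assuming the hiprand device API (curand_init is the drop-in CUDA equivalent):

    #include <hiprand_kernel.h>

    __device__ void initRngState(
        hiprandStateXORWOW_t & state,
        unsigned long long seed,
        unsigned long long subsequence )
    {
        // arguments: seed, subsequence, offset within the subsequence, state
        hiprand_init( seed, subsequence, 0, &state );
    }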