From 7d357cb49507b7057b10890510a3e4e0d419b966 Mon Sep 17 00:00:00 2001 From: Third Party Date: Mon, 16 Mar 2020 16:34:44 +0100 Subject: [PATCH] Squashed 'thirdParty/cupla/' changes from f60a0ac72c..b059405c8b b059405c8b Merge pull request #161 from psychocoderHPC/topic-nativeCuplaUsage 580352ebef usage of cupla without cuda renaming macros 35a564c1c8 Merge pull request #159 from psychocoderHPC/fix-docuKernelCall b54c71fd15 fix `KernelWithElementLevel` documentation REVERT: f60a0ac72c Merge pull request #155 from ComputationalRadiationPhysics/dev git-subtree-dir: thirdParty/cupla git-subtree-split: b059405c8bc59e5186cc8ebce3ced7643f0103c1 --- .gitlab-ci.yml | 7 + .travis.yml | 11 + .../CUDASamples/cuplaVectorAdd/CMakeLists.txt | 67 ++++ example/CUDASamples/cuplaVectorAdd/README.md | 9 + .../cuplaVectorAdd/src/vectorAdd.cpp | 288 ++++++++++++++++++ include/cuda_to_cupla.hpp | 2 +- include/cupla.hpp | 25 ++ include/cupla/cudaToCupla/driverTypes.hpp | 57 ++-- include/cupla/device/Atomic.hpp | 176 +++++++++++ include/cupla/device/Hierarchy.hpp | 43 +++ include/cupla/device/Index.hpp | 120 ++++++++ include/cupla/device/SharedMemory.hpp | 32 ++ include/cupla/device/Synchronization.hpp | 57 ++++ include/cupla/device_functions.hpp | 27 ++ include/cupla/kernel.hpp | 9 +- 15 files changed, 888 insertions(+), 42 deletions(-) create mode 100644 example/CUDASamples/cuplaVectorAdd/CMakeLists.txt create mode 100644 example/CUDASamples/cuplaVectorAdd/README.md create mode 100644 example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp create mode 100644 include/cupla.hpp create mode 100644 include/cupla/device/Atomic.hpp create mode 100644 include/cupla/device/Hierarchy.hpp create mode 100644 include/cupla/device/Index.hpp create mode 100644 include/cupla/device/SharedMemory.hpp create mode 100644 include/cupla/device/Synchronization.hpp create mode 100644 include/cupla/device_functions.hpp diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 73190f9ab3c..df72c681bbe 100644 --- 
a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -55,6 +55,13 @@ && cmake $cupla_DIR/example/CUDASamples/vectorAdd/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE && make -j && time ./vectorAdd 100000 + && rm -r * + && echo "###################################################" + && echo "Example cuplaVectorAdd (added elements layer)" + && echo "###################################################" + && cmake $cupla_DIR/example/CUDASamples/cuplaVectorAdd/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE + && make -j + && time ./cuplaVectorAdd 100000 && rm -r * ; done diff --git a/.travis.yml b/.travis.yml index 7f972607f5b..7a69df28685 100644 --- a/.travis.yml +++ b/.travis.yml @@ -39,6 +39,7 @@ before_install: - mkdir -p $HOME/asyncAPI - mkdir -p $HOME/asyncAPI_tuned - mkdir -p $HOME/vectorAdd + - mkdir -p $HOME/cuplaVectorAdd - mkdir -p $HOME/blackScholes - mkdir -p $HOME/test/config - export CMAKE_FLAGS="-DALPAKA_ACC_"$STRATEGY"_ENABLE=ON" @@ -157,6 +158,16 @@ script: ./vectorAdd 100000; fi ############################################################################# + # Example: cuplaVectorAdd (added elements layer) # + ############################################################################# + - cd $HOME/cuplaVectorAdd + - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/cuplaVectorAdd/ $CMAKE_FLAGS + - make + - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] || + [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then + ./cuplaVectorAdd 100000; + fi + ############################################################################# # Example: BlackScholes (adapted original) # ############################################################################# - cd $HOME/blackScholes diff --git a/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt b/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt new file mode 100644 index 00000000000..8411cfa57be --- /dev/null +++ b/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt @@ -0,0 +1,67 @@ +# +# Copyright 2016-2020 Rene Widera, Benjamin Worpitz, Vincent 
Ridder +# +# This file is part of cupla. +# +# cupla is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cupla is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with cupla. +# If not, see . +# + +################################################################################ +# Required CMake version. +################################################################################ + +cmake_minimum_required(VERSION 3.11.4) + +set_property(GLOBAL PROPERTY USE_FOLDERS ON) + +################################################################################ +# Project. +################################################################################ + +project(cuplaVectorAdd) + +################################################################################ +# CMake policies +# +# Search in _ROOT: +# https://cmake.org/cmake/help/v3.12/policy/CMP0074.html# +################################################################################ + +if(POLICY CMP0074) + cmake_policy(SET CMP0074 NEW) +endif() + +################################################################################ +# Find cupla +################################################################################ + +set(cupla_ROOT "$ENV{CUPLA_ROOT}" CACHE STRING "The location of the cupla library") + +list(APPEND CMAKE_MODULE_PATH "${cupla_ROOT}") +find_package(cupla REQUIRED) + + +################################################################################ +# Add library. 
+################################################################################ + +set(_SOURCE_DIR "src/") + +# Add all the source files in all recursive subdirectories and group them accordingly. +append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX) + +# Always add all files to the target executable build call to add them to the build project. +cupla_add_executable(${PROJECT_NAME} ${_FILES_SOURCE_CXX}) + diff --git a/example/CUDASamples/cuplaVectorAdd/README.md b/example/CUDASamples/cuplaVectorAdd/README.md new file mode 100644 index 00000000000..96a1779c729 --- /dev/null +++ b/example/CUDASamples/cuplaVectorAdd/README.md @@ -0,0 +1,9 @@ +# vector add example with native cupla interface + +This example is equal to `vectorAdd` but does not rely on the compatibility header (`cuda_to_cupla.hpp`) +that allows the usage of CUDA function names and types. + +CUDA-prefixed functions/types are prefixed with `cupla` instead. +CUDA functions/types that are not prefixed live in the namespace `cupla`. +Function calls always need the currently used accelerator instance. +Non-standard global variables like `threadIdx`, `blockDim` should be used as functions from the namespace `cupla`. \ No newline at end of file diff --git a/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp b/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp new file mode 100644 index 00000000000..4c91d53f417 --- /dev/null +++ b/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp @@ -0,0 +1,288 @@ +/* Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/** @file Vector addition: C = A + B. 
+ * + * This sample is a very basic sample that implements element by element + * vector addition. It is the same as the sample illustrating Chapter 2 + * of the programming guide with some additions like error checking. + */ + +#include +#include //std:cout +// For the CUDA runtime routines (prefixed with "cupla_") +#include +//Timer for test purpose +#include +#include +#include +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. + */ +struct vectorAdd { + template + ALPAKA_FN_HOST_ACC + void operator()(T_Acc const &acc, const float *A, const float *B, float *C, const int numElements) const { + int begin = cupla::blockDim(acc).x * cupla::blockIdx(acc).x * cupla::threadDim(acc).x + cupla::threadIdx(acc).x * cupla::threadDim(acc).x; + if (begin < numElements) { + int end = (begin + cupla::threadDim(acc).x < numElements) ? begin+cupla::threadDim(acc).x : numElements; + for (int i=begin; i 1e-5) + { + fprintf(stderr, "Result verification failed at element %d!\n", i); + exit(EXIT_FAILURE); + } + } + + printf("Test PASSED\n"); + + // Free device global memory + err = cuplaFree(d_A); + + if (err != cuplaSuccess) + { + fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cuplaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + err = cuplaFree(d_B); + + if (err != cuplaSuccess) + { + fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cuplaGetErrorString(err)); + exit(EXIT_FAILURE); + } + err = cuplaFree(d_C); + + if (err != cuplaSuccess) + { + fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cuplaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + // Free host memory + free(h_A); + free(h_B); + free(h_C); + + // Reset the device and exit + // cuplaDeviceReset causes the driver to clean up all state. While + // not mandatory in normal operation, it is good practice. 
It is also + // needed to ensure correct operation when the application is being + // profiled. Calling cuplaDeviceReset causes all profile data to be + // flushed before the application exits + err = cuplaDeviceReset(); + + if (err != cuplaSuccess) + { + fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cuplaGetErrorString(err)); + exit(EXIT_FAILURE); + } + printf("Done\n"); + + using boost::lexical_cast; + using boost::bad_lexical_cast; + std::vector args; + while (*++argv){ + try{ + args.push_back(lexical_cast(*argv)); + } + catch( const bad_lexical_cast &){ + args.push_back(0); + } + } + //run benchmartest + int first = 50000; + int last = 100000; + int stepSize= 50000; + if (args.size() >1){ + first=args[0]; + last=args[1]; + } + if (args.size()>2){ + stepSize=args[2]; + } + benchmarkTest(first, last, stepSize); + cuplaDeviceReset(); + return 0; +} + +void +benchmarkTest(int first, int last, int stepSize) +{ + + for (int numElements = first; numElements <=last ; numElements+= stepSize) { + std::cout <<"N= " < + (end-start).count() <<"ms"<. + * + */ + + +#pragma once + +#include "cupla_runtime.hpp" +#include "cupla/device_functions.hpp" diff --git a/include/cupla/cudaToCupla/driverTypes.hpp b/include/cupla/cudaToCupla/driverTypes.hpp index 087b918f96a..b1b0e924432 100644 --- a/include/cupla/cudaToCupla/driverTypes.hpp +++ b/include/cupla/cudaToCupla/driverTypes.hpp @@ -23,8 +23,10 @@ #pragma once #include "cupla/datatypes/Array.hpp" +#include "cupla/device/SharedMemory.hpp" +#include "cupla/device_functions.hpp" -#define __syncthreads(...) ::alpaka::block::sync::syncBlockThreads(acc) +#define __syncthreads(...) 
::cupla::syncThreads(acc) #define cudaSuccess cuplaSuccess #define cudaErrorMemoryAllocation cuplaErrorMemoryAllocation @@ -54,7 +56,7 @@ /* cudaEventBlockingSync is a define in CUDA, hence we must remove * the old definition with the cupla enum */ -#define cudaEventBlockingSync cuplaEventBlockingSync +#define cudaEventBlockingSync cuplaEventBlockingSync #ifdef cudaEventDisableTiming #undef cudaEventDisableTiming @@ -64,14 +66,6 @@ */ #define cudaEventDisableTiming cuplaEventDisableTiming -#define sharedMem(ppName, ...) \ - __VA_ARGS__ &ppName = \ - ::alpaka::block::shared::st::allocVar<__VA_ARGS__, __COUNTER__>(acc) - -#define sharedMemExtern(ppName, ...) \ - __VA_ARGS__* ppName = \ - ::alpaka::block::shared::dyn::getMem<__VA_ARGS__>(acc) - #define cudaMemcpyKind cuplaMemcpyKind #define cudaMemcpyHostToDevice cuplaMemcpyHostToDevice #define cudaMemcpyDeviceToHost cuplaMemcpyDeviceToHost @@ -79,22 +73,11 @@ #define cudaMemcpyHostToHost cuplaMemcpyHostToHost // index renaming -#define blockIdx \ - static_cast( \ - ::alpaka::idx::getIdx<::alpaka::Grid, ::alpaka::Blocks>(acc)) -#define threadIdx \ - static_cast( \ - ::alpaka::idx::getIdx<::alpaka::Block, ::alpaka::Threads>(acc)) - -#define gridDim \ - static_cast( \ - ::alpaka::workdiv::getWorkDiv<::alpaka::Grid, ::alpaka::Blocks>(acc)) -#define blockDim \ - static_cast( \ - ::alpaka::workdiv::getWorkDiv<::alpaka::Block, ::alpaka::Threads>(acc)) -#define elemDim \ - static_cast( \ - ::alpaka::workdiv::getWorkDiv<::alpaka::Thread, ::alpaka::Elems>(acc)) +#define blockIdx cupla::blockIdx(acc) +#define threadIdx cupla::threadIdx(acc) +#define gridDim cupla::gridDim(acc) +#define blockDim cupla::blockDim(acc) +#define elemDim cupla::threadDim(acc) /** Atomic functions * @@ -106,17 +89,17 @@ * * @{ */ -#define atomicAdd(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, __VA_ARGS__) -#define atomicSub(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Sub>(acc, __VA_ARGS__) -#define atomicMin(...) 
::alpaka::atomic::atomicOp<::alpaka::atomic::op::Min>(acc, __VA_ARGS__) -#define atomicMax(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Max>(acc, __VA_ARGS__) -#define atomicInc(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Inc>(acc, __VA_ARGS__) -#define atomicDec(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Dec>(acc, __VA_ARGS__) -#define atomicExch(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, __VA_ARGS__) -#define atomicCAS(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Cas>(acc, __VA_ARGS__) -#define atomicAnd(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::And>(acc, __VA_ARGS__) -#define atomicXor(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Xor>(acc, __VA_ARGS__) -#define atomicOr(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Or>(acc, __VA_ARGS__) +#define atomicAdd(...) cupla::atomicAdd(acc, __VA_ARGS__) +#define atomicSub(...) cupla::atomicSub(acc, __VA_ARGS__) +#define atomicMin(...) cupla::atomicMin(acc, __VA_ARGS__) +#define atomicMax(...) cupla::atomicMax(acc, __VA_ARGS__) +#define atomicInc(...) cupla::atomicInc(acc, __VA_ARGS__) +#define atomicDec(...) cupla::atomicDec(acc, __VA_ARGS__) +#define atomicExch(...) cupla::atomicExch(acc, __VA_ARGS__) +#define atomicCAS(...) cupla::atomicCas(acc, __VA_ARGS__) /* cupla/device/Atomic.hpp names the function atomicCas, not atomicCAS */ +#define atomicAnd(...) cupla::atomicAnd(acc, __VA_ARGS__) +#define atomicXor(...) cupla::atomicXor(acc, __VA_ARGS__) +#define atomicOr(...) cupla::atomicOr(acc, __VA_ARGS__) /** @} */ #define uint3 ::cupla::uint3 diff --git a/include/cupla/device/Atomic.hpp b/include/cupla/device/Atomic.hpp new file mode 100644 index 00000000000..5645a7a0f50 --- /dev/null +++ b/include/cupla/device/Atomic.hpp @@ -0,0 +1,176 @@ +/* Copyright 2020 Rene Widera + * + * This file is part of cupla. 
+ * + * cupla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * cupla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with cupla. + * If not, see . + * + */ + + +#pragma once + +#include "cupla/datatypes/uint.hpp" +#include "cupla/device/Hierarchy.hpp" +#include "cupla/types.hpp" + +#include + +namespace cupla +{ +inline namespace device +{ + +#define CUPLA_UNARY_ATOMIC_OP(functionName, alpakaOp) \ + /*! \ + * Compared to their CUDA/HIP counterparts, these functions take an additional last \ + * parameter to denote atomicity (synchronization) level. This parameter is \ + * of type cupla::hierarchy::{Grids|Blocks|Threads}. Grids corresponds \ + * to atomicity between different kernels, Blocks - to different blocks \ + * in the same grid/kernel, Threads - to threads of the same block. \ + * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type cupla::hierarchy::*] \ + * @tparam T_Acc alpaka accelerator [alpaka::acc::*] \ + * @tparam T_Type type of the value \ + * @param acc alpaka accelerator \ + * @param ptr destination pointer; the result of the alpaka atomic operation (the value stored there before the operation) is returned \ + * @param value source value \ + * @{ \ + */ \ + template< \ + typename T_Hierarchy, \ + typename T_Acc, \ + typename T_Type \ + > \ + ALPAKA_FN_ACC ALPAKA_FN_INLINE \ + T_Type functionName( \ + T_Acc const & acc, \ + T_Type *ptr, \ + T_Type const & value \ + ) \ + { \ + return ::alpaka::atomic::atomicOp< alpakaOp >( \ + acc, \ + ptr, \ + value, \ + T_Hierarchy{} \ + ); \ + } \ + \ + /*! 
@param hierarchy hierarchy level within the operation is atomic \ + */ \ + template< \ + typename T_Acc, \ + typename T_Type, \ + typename T_Hierarchy = alpaka::hierarchy::Grids \ + > \ + ALPAKA_FN_ACC ALPAKA_FN_INLINE \ + T_Type functionName( \ + T_Acc const & acc, \ + T_Type *ptr, \ + T_Type const & value, \ + T_Hierarchy const & hierarchy = T_Hierarchy() \ + ) \ + { \ + return functionName< T_Hierarchy >( \ + acc, \ + ptr, \ + value \ + ); \ + } \ + /*!@} \ + */ + + /// atomic addition + CUPLA_UNARY_ATOMIC_OP( atomicAdd, ::alpaka::atomic::op::Add ) + /// atomic subtraction + CUPLA_UNARY_ATOMIC_OP( atomicSub, ::alpaka::atomic::op::Sub ) + /// atomic minimum + CUPLA_UNARY_ATOMIC_OP( atomicMin, ::alpaka::atomic::op::Min ) + /// atomic maximum + CUPLA_UNARY_ATOMIC_OP( atomicMax, ::alpaka::atomic::op::Max ) + /// atomic increment + CUPLA_UNARY_ATOMIC_OP( atomicInc, ::alpaka::atomic::op::Inc ) + /// atomic decrement + CUPLA_UNARY_ATOMIC_OP( atomicDec, ::alpaka::atomic::op::Dec ) + /// atomic bit-wise and + CUPLA_UNARY_ATOMIC_OP( atomicAnd, ::alpaka::atomic::op::And ) + /// atomic bit-wise or + CUPLA_UNARY_ATOMIC_OP( atomicOr, ::alpaka::atomic::op::Or ) + /// atomic exchange + CUPLA_UNARY_ATOMIC_OP( atomicExch, ::alpaka::atomic::op::Exch ) + /// atomic bit-wise xor + CUPLA_UNARY_ATOMIC_OP( atomicXor, ::alpaka::atomic::op::Xor ) + +#undef CUPLA_UNARY_ATOMIC_OP + + /** atomic compare and swap (forwards the result of the alpaka Cas operation, i.e. the old value at ptr) + * + * @{ + * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type cupla::hierarchy::*] + * @tparam T_Acc alpaka accelerator [alpaka::acc::*] + * @tparam T_Type type of the value + * @param acc alpaka accelerator + * @param ptr destination pointer + * @param value source value (passed after the additional parameter compare) + */ + template< + typename T_Hierarchy, + typename T_Acc, + typename T_Type + > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + T_Type atomicCas( + T_Acc const & acc, + T_Type *ptr, + T_Type const & compare, + T_Type const & value + ) + { + return ::alpaka::atomic::atomicOp< ::alpaka::atomic::op::Cas >( + 
acc, + ptr, + compare, + value, + T_Hierarchy{} + ); + } + + /*! @param hierarchy hierarchy level within the operation is atomic + */ + template< + typename T_Acc, + typename T_Type, + typename T_Hierarchy = hierarchy::Grids + > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + T_Type atomicCas( + T_Acc const & acc, + T_Type *ptr, + T_Type const & compare, + T_Type const & value, + T_Hierarchy const & hierarchy = T_Hierarchy() + ) + { + return atomicCas< T_Hierarchy >( + acc, + ptr, + compare, + value + ); + } + /*!@} + */ + +} // namespace device +} // namespace cupla diff --git a/include/cupla/device/Hierarchy.hpp b/include/cupla/device/Hierarchy.hpp new file mode 100644 index 00000000000..130faa31be9 --- /dev/null +++ b/include/cupla/device/Hierarchy.hpp @@ -0,0 +1,43 @@ +/* Copyright 2020 Rene Widera + * + * This file is part of cupla. + * + * cupla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * cupla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with cupla. + * If not, see . + * + */ + + +#pragma once + +#include "cupla/types.hpp" + +#include + +namespace cupla +{ +inline namespace CUPLA_ACCELERATOR_NAMESPACE +{ +inline namespace device +{ +namespace hierarchy +{ + + //! 
hierarchy definitions for atomic operations + using namespace ::alpaka::hierarchy; + +} // namespace hierarchy +} // namespace device +} // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla/device/Index.hpp b/include/cupla/device/Index.hpp new file mode 100644 index 00000000000..2f90572f4f3 --- /dev/null +++ b/include/cupla/device/Index.hpp @@ -0,0 +1,120 @@ +/* Copyright 2020 Rene Widera + * + * This file is part of cupla. + * + * cupla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * cupla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with cupla. + * If not, see . 
+ * + */ + + +#pragma once + +#include "cupla/datatypes/uint.hpp" +#include "cupla/types.hpp" + +#include + +namespace cupla +{ +inline namespace device +{ + + /** number of blocks within the grid layer + * + * @tparam T_Acc alpaka accelerator [alpaka::acc::*] + * @param acc alpaka accelerator + */ + template< typename T_Acc > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + cupla::uint3 gridDim( T_Acc const & acc ) + { + return static_cast< uint3 >( + ::alpaka::workdiv::getWorkDiv< + ::alpaka::Grid, + ::alpaka::Blocks + >( acc ) + ); + } + + /** number of threads within the block layer + * + * @tparam T_Acc alpaka accelerator [alpaka::acc::*] + * @param acc alpaka accelerator + */ + template< typename T_Acc > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + cupla::uint3 blockDim( T_Acc const & acc ) + { + return static_cast< uint3 >( + ::alpaka::workdiv::getWorkDiv< + ::alpaka::Block, + ::alpaka::Threads + >( acc ) + ); + } + + /** number of elements within the thread layer + * + * @tparam T_Acc alpaka accelerator [alpaka::acc::*] + * @param acc alpaka accelerator + */ + template< typename T_Acc > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + cupla::uint3 threadDim( T_Acc const & acc ) + { + return static_cast< uint3 >( + ::alpaka::workdiv::getWorkDiv< + ::alpaka::Thread, + ::alpaka::Elems + >( acc ) + ); + } + + /** index of the thread within the block layer + * + * @tparam T_Acc alpaka accelerator [alpaka::acc::*] + * @param acc alpaka accelerator + */ + template< typename T_Acc > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + cupla::uint3 threadIdx( T_Acc const & acc ) + { + return static_cast< uint3 >( + ::alpaka::idx::getIdx< + ::alpaka::Block, + ::alpaka::Threads + >( acc ) + ); + } + + /** index of the block within the grid layer + * + * @tparam T_Acc alpaka accelerator [alpaka::acc::*] + * @param acc alpaka accelerator + */ + template< typename T_Acc > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + cupla::uint3 blockIdx( T_Acc const & acc ) + { + return static_cast< uint3 >( + ::alpaka::idx::getIdx< + ::alpaka::Grid, 
+ ::alpaka::Blocks + >( acc ) + ); + } + +} // namespace device +} // namespace cupla diff --git a/include/cupla/device/SharedMemory.hpp b/include/cupla/device/SharedMemory.hpp new file mode 100644 index 00000000000..a04169eb22f --- /dev/null +++ b/include/cupla/device/SharedMemory.hpp @@ -0,0 +1,32 @@ +/* Copyright 2020 Rene Widera + * + * This file is part of cupla. + * + * cupla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * cupla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with cupla. + * If not, see . + * + */ + + +#pragma once + +#include + +#define sharedMem(ppName, ...) \ + __VA_ARGS__& ppName = \ + ::alpaka::block::shared::st::allocVar< __VA_ARGS__, __COUNTER__ >( acc ) + +#define sharedMemExtern(ppName, ...) \ + __VA_ARGS__* ppName = \ + ::alpaka::block::shared::dyn::getMem< __VA_ARGS__ >( acc ) diff --git a/include/cupla/device/Synchronization.hpp b/include/cupla/device/Synchronization.hpp new file mode 100644 index 00000000000..e24e37b4624 --- /dev/null +++ b/include/cupla/device/Synchronization.hpp @@ -0,0 +1,57 @@ +/* Copyright 2020 Rene Widera + * + * This file is part of cupla. + * + * cupla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * cupla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with cupla. + * If not, see . + * + */ + + +#pragma once + +#include "cupla/types.hpp" + +#include + +namespace cupla +{ +inline namespace device +{ + + /** synchronize threads within the block + * + * @tparam T_Acc alpaka accelerator [alpaka::acc::*] + * @param acc alpaka accelerator + * + * @{ + */ + template< typename T_Acc > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + void syncThreads( T_Acc const & acc ) + { + ::alpaka::block::sync::syncBlockThreads( acc ); + } + + template< typename T_Acc > + ALPAKA_FN_ACC ALPAKA_FN_INLINE + void __syncthreads( T_Acc const & acc ) + { + syncThreads( acc ); + } + + //!@} + +} // namespace device +} // namespace cupla diff --git a/include/cupla/device_functions.hpp b/include/cupla/device_functions.hpp new file mode 100644 index 00000000000..56fc6af01d2 --- /dev/null +++ b/include/cupla/device_functions.hpp @@ -0,0 +1,27 @@ +/* Copyright 2020 Rene Widera + * + * This file is part of cupla. + * + * cupla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * cupla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with cupla. + * If not, see . 
+ * + */ + + +#pragma once + +#include "cupla/device/Synchronization.hpp" +#include "cupla/device/Index.hpp" +#include "cupla/device/Atomic.hpp" +#include "cupla/device/SharedMemory.hpp" diff --git a/include/cupla/kernel.hpp b/include/cupla/kernel.hpp index d0d27506f7c..df287f170f4 100644 --- a/include/cupla/kernel.hpp +++ b/include/cupla/kernel.hpp @@ -172,6 +172,10 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE /* Kernel configuration interface with element support * * The kernel must support the alpaka element layer. + * + * Swap the blockSize and the elemSize depending on the activated accelerator. + * This means that on some devices the blockSize is set to one ( dim3(1,1,1) ) + * and the elemSize is set to the user-defined blockSize. */ template< typename T_KernelType @@ -201,11 +205,8 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE }; /** Kernel configuration interface with element support - * The kernel must support the alpaka element level * - * Swap the blockSize and the elemSize depending on the activated accelerator. - * This mean that in some devices the blockSize is set to one ( dim3(1,1,1) ) - * and the elemSize is set to the user defined blockSize + * The kernel must support the alpaka element level */ template< typename T_KernelType