From 7d357cb49507b7057b10890510a3e4e0d419b966 Mon Sep 17 00:00:00 2001
From: Third Party <picongpu@hzdr.de>
Date: Mon, 16 Mar 2020 16:34:44 +0100
Subject: [PATCH] Squashed 'thirdParty/cupla/' changes from
 f60a0ac72c..b059405c8b

b059405c8b Merge pull request #161 from psychocoderHPC/topic-nativeCuplaUsage
580352ebef usage of cupla without cuda renaming macros
35a564c1c8 Merge pull request #159 from psychocoderHPC/fix-docuKernelCall
b54c71fd15 fix `KernelWithElementLevel` documentation
REVERT: f60a0ac72c Merge pull request #155 from ComputationalRadiationPhysics/dev

git-subtree-dir: thirdParty/cupla
git-subtree-split: b059405c8bc59e5186cc8ebce3ced7643f0103c1
---
 .gitlab-ci.yml                                |   7 +
 .travis.yml                                   |  11 +
 .../CUDASamples/cuplaVectorAdd/CMakeLists.txt |  67 ++++
 example/CUDASamples/cuplaVectorAdd/README.md  |   9 +
 .../cuplaVectorAdd/src/vectorAdd.cpp          | 288 ++++++++++++++++++
 include/cuda_to_cupla.hpp                     |   2 +-
 include/cupla.hpp                             |  25 ++
 include/cupla/cudaToCupla/driverTypes.hpp     |  57 ++--
 include/cupla/device/Atomic.hpp               | 176 +++++++++++
 include/cupla/device/Hierarchy.hpp            |  43 +++
 include/cupla/device/Index.hpp                | 120 ++++++++
 include/cupla/device/SharedMemory.hpp         |  32 ++
 include/cupla/device/Synchronization.hpp      |  57 ++++
 include/cupla/device_functions.hpp            |  27 ++
 include/cupla/kernel.hpp                      |   9 +-
 15 files changed, 888 insertions(+), 42 deletions(-)
 create mode 100644 example/CUDASamples/cuplaVectorAdd/CMakeLists.txt
 create mode 100644 example/CUDASamples/cuplaVectorAdd/README.md
 create mode 100644 example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp
 create mode 100644 include/cupla.hpp
 create mode 100644 include/cupla/device/Atomic.hpp
 create mode 100644 include/cupla/device/Hierarchy.hpp
 create mode 100644 include/cupla/device/Index.hpp
 create mode 100644 include/cupla/device/SharedMemory.hpp
 create mode 100644 include/cupla/device/Synchronization.hpp
 create mode 100644 include/cupla/device_functions.hpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 73190f9ab3c..df72c681bbe 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -55,6 +55,13 @@
         && cmake $cupla_DIR/example/CUDASamples/vectorAdd/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
         && make -j
         && time ./vectorAdd 100000
+        && rm -r *
+        && echo "###################################################"
+        && echo "Example cuplaVectorAdd (added elements layer)"
+        && echo "###################################################"
+        && cmake $cupla_DIR/example/CUDASamples/cuplaVectorAdd/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
+        && make -j
+        && time ./cuplaVectorAdd 100000
         && rm -r * ;
       done
 
diff --git a/.travis.yml b/.travis.yml
index 7f972607f5b..7a69df28685 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -39,6 +39,7 @@ before_install:
   - mkdir -p $HOME/asyncAPI
   - mkdir -p $HOME/asyncAPI_tuned
   - mkdir -p $HOME/vectorAdd
+  - mkdir -p $HOME/cuplaVectorAdd
   - mkdir -p $HOME/blackScholes
   - mkdir -p $HOME/test/config
   - export CMAKE_FLAGS="-DALPAKA_ACC_"$STRATEGY"_ENABLE=ON"
@@ -157,6 +158,16 @@ script:
       ./vectorAdd 100000;
     fi
   #############################################################################
+  # Example: cuplaVectorAdd (added elements layer)                            #
+  #############################################################################
+  - cd $HOME/cuplaVectorAdd
+  - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/cuplaVectorAdd/ $CMAKE_FLAGS
+  - make
+  - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] ||
+       [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then
+      ./cuplaVectorAdd 100000;
+    fi
+  #############################################################################
   # Example: BlackScholes (adapted original)                                 #
   #############################################################################
   - cd $HOME/blackScholes
diff --git a/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt b/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt
new file mode 100644
index 00000000000..8411cfa57be
--- /dev/null
+++ b/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt
@@ -0,0 +1,67 @@
+#
+# Copyright 2016-2020 Rene Widera, Benjamin Worpitz, Vincent Ridder
+#
+# This file is part of cupla.
+#
+# cupla is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cupla is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with cupla.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+################################################################################
+# Required CMake version.
+################################################################################
+
+cmake_minimum_required(VERSION 3.11.4)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+################################################################################
+
+project(cuplaVectorAdd)
+
+################################################################################
+# CMake policies
+#
+# Search in <PackageName>_ROOT:
+#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html#
+################################################################################
+
+if(POLICY CMP0074)
+    cmake_policy(SET CMP0074 NEW)
+endif()
+
+################################################################################
+# Find cupla
+################################################################################
+
+set(cupla_ROOT "$ENV{CUPLA_ROOT}" CACHE STRING  "The location of the cupla library")
+
+list(APPEND CMAKE_MODULE_PATH "${cupla_ROOT}")
+find_package(cupla REQUIRED)
+
+
+################################################################################
+# Add executable.
+################################################################################
+
+set(_SOURCE_DIR "src/")
+
+# Add all the source files in all recursive subdirectories and group them accordingly.
+append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX)
+
+# Always add all files to the target executable build call to add them to the build project.
+cupla_add_executable(${PROJECT_NAME} ${_FILES_SOURCE_CXX})
+
diff --git a/example/CUDASamples/cuplaVectorAdd/README.md b/example/CUDASamples/cuplaVectorAdd/README.md
new file mode 100644
index 00000000000..96a1779c729
--- /dev/null
+++ b/example/CUDASamples/cuplaVectorAdd/README.md
@@ -0,0 +1,9 @@
+# vector add example with native cupla interface
+
+This example is equivalent to `vectorAdd` but does not rely on the compatibility header (`cuda_to_cupla.hpp`),
+which allows the usage of CUDA function names and types.
+
+CUDA-prefixed functions/types are prefixed with `cupla` instead.
+CUDA functions/types that are not prefixed live in the namespace `cupla`.
+Function calls always need the currently used accelerator instance.
+Non-standard global variables like `threadIdx`, `blockDim` should be used as functions from the namespace `cupla`.
\ No newline at end of file
diff --git a/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp b/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp
new file mode 100644
index 00000000000..4c91d53f417
--- /dev/null
+++ b/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp
@@ -0,0 +1,288 @@
+/* Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/** @file Vector addition: C = A + B.
+ *
+ * This sample is a very basic sample that implements element by element
+ * vector addition. It is the same as the sample illustrating Chapter 2
+ * of the programming guide with some additions like error checking.
+ */
+
+#include <stdio.h>
+#include <iostream> //std:cout
+// For the cupla runtime routines (prefixed with "cupla")
+#include <cupla.hpp>
+//Timer for test purpose
+#include <chrono>
+#include <boost/lexical_cast.hpp>
+#include <vector>
+/**
+ * Kernel functor (device code): element-wise vector addition C = A + B.
+ *
+ * Each thread handles a contiguous chunk of cupla::threadDim(acc).x elements starting
+ * at `begin`; the chunk is clipped to numElements, the common length of A, B and C.
+ */
+struct vectorAdd {
+    template<typename T_Acc>
+    ALPAKA_FN_HOST_ACC
+    void operator()(T_Acc const &acc, const float *A, const float *B, float *C, const int numElements) const {
+        int begin = cupla::blockDim(acc).x * cupla::blockIdx(acc).x * cupla::threadDim(acc).x + cupla::threadIdx(acc).x * cupla::threadDim(acc).x;
+        if (begin < numElements) {
+            int end = (begin + cupla::threadDim(acc).x < numElements) ? begin+cupla::threadDim(acc).x : numElements;
+            for (int i=begin; i <end; ++i) {
+                C[i] = A[i] + B[i];
+            }
+        }
+    }
+};
+
+void benchmarkTest(int first, int last , int stepSize);
+/**
+ * Host main routine: runs one verified vector addition, then the benchmark loop.
+ */
+int
+main(int argc, char *argv[])
+{
+    // Error code to check return values of cupla calls
+    cuplaError_t err = cuplaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    size_t size = numElements * sizeof(float);
+    printf("[Vector addition of %d elements]\n", numElements);
+
+    // Allocate the host input vector A
+    float *h_A = (float *)malloc(size);
+
+    // Allocate the host input vector B
+    float *h_B = (float *)malloc(size);
+
+    // Allocate the host output vector C
+    float *h_C = (float *)malloc(size);
+
+    // Verify that allocations succeeded
+    if (h_A == NULL || h_B == NULL || h_C == NULL)
+    {
+        fprintf(stderr, "Failed to allocate host vectors!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Initialize the host input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        h_A[i] = rand()/(float)RAND_MAX;
+        h_B[i] = rand()/(float)RAND_MAX;
+    }
+
+    // Allocate the device input vector A
+    float *d_A = NULL;
+    err = cuplaMalloc((void **)&d_A, size);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Allocate the device input vector B
+    float *d_B = NULL;
+    err = cuplaMalloc((void **)&d_B, size);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Allocate the device output vector C
+    float *d_C = NULL;
+    err = cuplaMalloc((void **)&d_C, size);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Copy the host input vectors A and B in host memory to the device input vectors in
+    // device memory
+    printf("Copy input data from the host memory to the CUDA device\n");
+    err = cuplaMemcpy(d_A, h_A, size, cuplaMemcpyHostToDevice);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    err = cuplaMemcpy(d_B, h_B, size, cuplaMemcpyHostToDevice);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Launch the vectorAdd kernel; the block count is numElements / threadsPerBlock rounded up
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
+    CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock,0,0)(d_A, d_B, d_C, numElements);
+    err = cuplaGetLastError();
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Copy the device result vector in device memory to the host result vector
+    // in host memory.
+    printf("Copy output data from the CUDA device to the host memory\n");
+    err = cuplaMemcpy(h_C, d_C, size, cuplaMemcpyDeviceToHost);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Verify that the result vector is correct (absolute tolerance 1e-5 per element)
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
+        {
+            fprintf(stderr, "Result verification failed at element %d!\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    printf("Test PASSED\n");
+
+    // Free device global memory
+    err = cuplaFree(d_A);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    err = cuplaFree(d_B);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+    err = cuplaFree(d_C);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Free host memory
+    free(h_A);
+    free(h_B);
+    free(h_C);
+
+    // Reset the device and exit
+    // cuplaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cuplaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    err = cuplaDeviceReset();
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+    printf("Done\n");
+
+    using boost::lexical_cast;
+    using boost::bad_lexical_cast;
+    std::vector<int> args;
+    while (*++argv){ // skip argv[0]; the argv array is terminated by a null pointer
+        try{
+            args.push_back(lexical_cast<int>(*argv));
+        }
+        catch( const bad_lexical_cast &){
+            args.push_back(0); // arguments that are not integers are stored as 0
+        }
+    }
+    // Run the benchmark test; the values below are the defaults when no arguments override them
+    int first = 50000;
+    int last = 100000;
+    int stepSize= 50000;
+    if (args.size() >1){ // first and last are only overridden together; a single argument is ignored
+        first=args[0];
+        last=args[1];
+    }
+    if (args.size()>2){
+        stepSize=args[2];
+    }
+    benchmarkTest(first, last, stepSize);
+    cuplaDeviceReset(); // second reset after the benchmark; its return value is not checked
+    return 0;
+}
+
+/** Benchmark the vectorAdd kernel for numElements = first, first + stepSize, ... while numElements <= last.
+ *  Prints N and the elapsed kernel time in ms for each size; a non-positive stepSize is rejected. */
+void benchmarkTest(int first, int last, int stepSize)
+{
+    if (stepSize <= 0) { // main() maps unparsable arguments to 0; such a step would never end the loop
+        fprintf(stderr, "benchmarkTest: stepSize must be positive\n");
+        return;
+    }
+    for (int numElements = first; numElements <= last; numElements += stepSize) {
+        std::cout << "N= " << numElements << "; ";
+        size_t size = numElements * sizeof(float);
+        // alloc host memory
+        float *h_A = (float *)malloc(size);
+        float *h_B = (float *)malloc(size);
+        if (size != 0 && (h_A == NULL || h_B == NULL)) { // malloc(0) may legitimately return NULL
+            fprintf(stderr, "Failed to allocate host vectors!\n");
+            exit(EXIT_FAILURE);
+        }
+        for (int i = 0; i < numElements; ++i) { // init host input
+            h_A[i] = rand()/(float)RAND_MAX;
+            h_B[i] = rand()/(float)RAND_MAX;
+        }
+        // alloc device memory
+        float *d_A = NULL;
+        cuplaMalloc((void **) &d_A, size);
+        float *d_B = NULL;
+        cuplaMalloc((void **) &d_B, size);
+        float *d_C = NULL;
+        cuplaMalloc((void **) &d_C, size);
+        // copy host -> device
+        cuplaMemcpy(d_A, h_A, size, cuplaMemcpyHostToDevice);
+        cuplaMemcpy(d_B, h_B, size, cuplaMemcpyHostToDevice);
+        int threadsPerBlock=1024;
+        int blocksPerGrid= (numElements+threadsPerBlock-1)/threadsPerBlock;
+        // run kernel; cuplaDeviceSynchronize() is called before the end time stamp is taken
+        auto start = std::chrono::high_resolution_clock::now();
+        CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock, 0, 0)(d_A, d_B, d_C, numElements);
+        cuplaDeviceSynchronize();
+        auto end = std::chrono::high_resolution_clock::now();
+        std::cout << "Time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count() << "ms" << std::endl;
+        // free device memory and the host buffers allocated for this iteration
+        cuplaFree(d_A);
+        cuplaFree(d_B);
+        cuplaFree(d_C);
+        free(h_A);
+        free(h_B);
+    }
+}
+
diff --git a/include/cuda_to_cupla.hpp b/include/cuda_to_cupla.hpp
index e36a37d05b9..b3de9d86867 100644
--- a/include/cuda_to_cupla.hpp
+++ b/include/cuda_to_cupla.hpp
@@ -21,7 +21,7 @@
 
 #pragma once
 
-#include "cupla_runtime.hpp"
+#include "cupla.hpp"
 
 #include "cupla/cudaToCupla/driverTypes.hpp"
 #include "cupla/cudaToCupla/runtime.hpp"
diff --git a/include/cupla.hpp b/include/cupla.hpp
new file mode 100644
index 00000000000..85284e6d7c8
--- /dev/null
+++ b/include/cupla.hpp
@@ -0,0 +1,25 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla_runtime.hpp"
+#include "cupla/device_functions.hpp"
diff --git a/include/cupla/cudaToCupla/driverTypes.hpp b/include/cupla/cudaToCupla/driverTypes.hpp
index 087b918f96a..b1b0e924432 100644
--- a/include/cupla/cudaToCupla/driverTypes.hpp
+++ b/include/cupla/cudaToCupla/driverTypes.hpp
@@ -23,8 +23,10 @@
 #pragma once
 
 #include "cupla/datatypes/Array.hpp"
+#include "cupla/device/SharedMemory.hpp"
+#include "cupla/device_functions.hpp"
 
-#define __syncthreads(...) ::alpaka::block::sync::syncBlockThreads(acc)
+#define __syncthreads(...) ::cupla::syncThreads(acc)
 
 #define cudaSuccess cuplaSuccess
 #define cudaErrorMemoryAllocation cuplaErrorMemoryAllocation
@@ -54,7 +56,7 @@
 /* cudaEventBlockingSync is a define in CUDA, hence we must remove
  * the old definition with the cupla enum
  */
-#define cudaEventBlockingSync cuplaEventBlockingSync 
+#define cudaEventBlockingSync cuplaEventBlockingSync
 
 #ifdef cudaEventDisableTiming
 #undef cudaEventDisableTiming
@@ -64,14 +66,6 @@
  */
 #define cudaEventDisableTiming cuplaEventDisableTiming
 
-#define sharedMem(ppName, ...)                                                 \
-  __VA_ARGS__ &ppName =                                                        \
-      ::alpaka::block::shared::st::allocVar<__VA_ARGS__, __COUNTER__>(acc)
-
-#define sharedMemExtern(ppName, ...)                                           \
-    __VA_ARGS__* ppName =                                                      \
-        ::alpaka::block::shared::dyn::getMem<__VA_ARGS__>(acc)
-
 #define cudaMemcpyKind cuplaMemcpyKind
 #define cudaMemcpyHostToDevice cuplaMemcpyHostToDevice
 #define cudaMemcpyDeviceToHost cuplaMemcpyDeviceToHost
@@ -79,22 +73,11 @@
 #define cudaMemcpyHostToHost cuplaMemcpyHostToHost
 
 // index renaming
-#define blockIdx                                                               \
-  static_cast<uint3>(                                                \
-      ::alpaka::idx::getIdx<::alpaka::Grid, ::alpaka::Blocks>(acc))
-#define threadIdx                                                              \
-  static_cast<uint3>(                                                \
-      ::alpaka::idx::getIdx<::alpaka::Block, ::alpaka::Threads>(acc))
-
-#define gridDim                                                                \
-  static_cast<uint3>(                                                \
-      ::alpaka::workdiv::getWorkDiv<::alpaka::Grid, ::alpaka::Blocks>(acc))
-#define blockDim                                                               \
-  static_cast<uint3>(                                                \
-      ::alpaka::workdiv::getWorkDiv<::alpaka::Block, ::alpaka::Threads>(acc))
-#define elemDim                                                               \
-  static_cast<uint3>(                                                \
-      ::alpaka::workdiv::getWorkDiv<::alpaka::Thread, ::alpaka::Elems>(acc))
+#define blockIdx ::cupla::blockIdx(acc)
+#define threadIdx ::cupla::threadIdx(acc)
+#define gridDim ::cupla::gridDim(acc)
+#define blockDim ::cupla::blockDim(acc)
+#define elemDim ::cupla::threadDim(acc) /* work division Thread -> Elems */
 
 /** Atomic functions
  *
@@ -106,17 +89,17 @@
  *
  * @{
  */
-#define atomicAdd(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, __VA_ARGS__)
-#define atomicSub(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Sub>(acc, __VA_ARGS__)
-#define atomicMin(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Min>(acc, __VA_ARGS__)
-#define atomicMax(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Max>(acc, __VA_ARGS__)
-#define atomicInc(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Inc>(acc, __VA_ARGS__)
-#define atomicDec(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Dec>(acc, __VA_ARGS__)
-#define atomicExch(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, __VA_ARGS__)
-#define atomicCAS(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Cas>(acc, __VA_ARGS__)
-#define atomicAnd(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::And>(acc, __VA_ARGS__)
-#define atomicXor(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Xor>(acc, __VA_ARGS__)
-#define atomicOr(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Or>(acc, __VA_ARGS__)
+#define atomicAdd(...) ::cupla::atomicAdd(acc, __VA_ARGS__)
+#define atomicSub(...) ::cupla::atomicSub(acc, __VA_ARGS__)
+#define atomicMin(...) ::cupla::atomicMin(acc, __VA_ARGS__)
+#define atomicMax(...) ::cupla::atomicMax(acc, __VA_ARGS__)
+#define atomicInc(...) ::cupla::atomicInc(acc, __VA_ARGS__)
+#define atomicDec(...) ::cupla::atomicDec(acc, __VA_ARGS__)
+#define atomicExch(...) ::cupla::atomicExch(acc, __VA_ARGS__)
+#define atomicCAS(...) ::cupla::atomicCas(acc, __VA_ARGS__) /* the cupla function is spelled atomicCas */
+#define atomicAnd(...) ::cupla::atomicAnd(acc, __VA_ARGS__)
+#define atomicXor(...) ::cupla::atomicXor(acc, __VA_ARGS__)
+#define atomicOr(...) ::cupla::atomicOr(acc, __VA_ARGS__)
 /** @} */
 
 #define uint3 ::cupla::uint3
diff --git a/include/cupla/device/Atomic.hpp b/include/cupla/device/Atomic.hpp
new file mode 100644
index 00000000000..5645a7a0f50
--- /dev/null
+++ b/include/cupla/device/Atomic.hpp
@@ -0,0 +1,176 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/datatypes/uint.hpp"
+#include "cupla/device/Hierarchy.hpp"
+#include "cupla/types.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+namespace cupla
+{
+inline namespace device
+{
+
+#define CUPLA_UNARY_ATOMIC_OP(functionName, alpakaOp)                          \
+        /*!                                                                    \
+         * Compared to their CUDA/HIP counterparts, these functions take an additional last \
+         * parameter to denote atomicity (synchronization) level. This parameter is \
+         * of type cupla::hierarchy::{Grids|Blocks|Threads}. Grids corresponds \
+         * to atomicity between different kernels, Blocks - to different blocks \
+         * in the same grid/kernel, Threads - to threads of the same block.    \
+         * @tparam T_Hierarchy parallelism hierarchy level within which the operation is atomic [type cupla::hierarchy::*] \
+         * @tparam T_Acc alpaka accelerator [alpaka::acc::*]                   \
+         * @tparam T_Type type of the value                                    \
+         * @param acc alpaka accelerator                                       \
+         * @param ptr destination pointer                                      \
+         * @param value source value                                           \
+         * @return result of ::alpaka::atomic::atomicOp (value at ptr before the operation) \
+         * @{                                                                  \
+         */                                                                    \
+        template<                                                              \
+            typename T_Hierarchy,                                              \
+            typename T_Acc,                                                    \
+            typename T_Type                                                    \
+        >                                                                      \
+        ALPAKA_FN_ACC ALPAKA_FN_INLINE                                         \
+        T_Type functionName(                                                   \
+            T_Acc const & acc,                                                 \
+            T_Type *ptr,                                                       \
+            T_Type const & value                                               \
+        )                                                                      \
+        {                                                                      \
+            return ::alpaka::atomic::atomicOp< alpakaOp >(                     \
+                acc,                                                           \
+                ptr,                                                           \
+                value,                                                         \
+                T_Hierarchy{}                                                  \
+            );                                                                 \
+        }                                                                      \
+                                                                               \
+        /*! @param hierarchy tag selecting the atomicity level; only its type is used */ \
+        template<                                                              \
+            typename T_Acc,                                                    \
+            typename T_Type,                                                   \
+            typename T_Hierarchy = ::alpaka::hierarchy::Grids                  \
+        >                                                                      \
+        ALPAKA_FN_ACC ALPAKA_FN_INLINE                                         \
+        T_Type functionName(                                                   \
+            T_Acc const & acc,                                                 \
+            T_Type *ptr,                                                       \
+            T_Type const & value,                                              \
+            T_Hierarchy const & hierarchy = T_Hierarchy()                      \
+        )                                                                      \
+        {                                                                      \
+            return functionName< T_Hierarchy >(                                \
+                acc,                                                           \
+                ptr,                                                           \
+                value                                                          \
+            );                                                                 \
+        }                                                                      \
+        /*!@}                                                                  \
+         */
+
+        /// atomic addition
+        CUPLA_UNARY_ATOMIC_OP( atomicAdd, ::alpaka::atomic::op::Add )
+        /// atomic subtraction
+        CUPLA_UNARY_ATOMIC_OP( atomicSub, ::alpaka::atomic::op::Sub )
+        /// atomic minimum
+        CUPLA_UNARY_ATOMIC_OP( atomicMin, ::alpaka::atomic::op::Min )
+        /// atomic maximum
+        CUPLA_UNARY_ATOMIC_OP( atomicMax, ::alpaka::atomic::op::Max )
+        /// atomic increment
+        CUPLA_UNARY_ATOMIC_OP( atomicInc, ::alpaka::atomic::op::Inc )
+        /// atomic decrement
+        CUPLA_UNARY_ATOMIC_OP( atomicDec, ::alpaka::atomic::op::Dec )
+        /// atomic bit-wise and
+        CUPLA_UNARY_ATOMIC_OP( atomicAnd, ::alpaka::atomic::op::And )
+        /// atomic bit-wise or
+        CUPLA_UNARY_ATOMIC_OP( atomicOr, ::alpaka::atomic::op::Or )
+        /// atomic exchange
+        CUPLA_UNARY_ATOMIC_OP( atomicExch, ::alpaka::atomic::op::Exch )
+        /// atomic bit-wise xor
+        CUPLA_UNARY_ATOMIC_OP( atomicXor, ::alpaka::atomic::op::Xor )
+
+#undef CUPLA_UNARY_ATOMIC_OP
+
+        /** atomic compare and swap
+         *
+         * @{
+         * @tparam T_Hierarchy parallelism hierarchy level within which the operation is atomic [type cupla::hierarchy::*]
+         * @tparam T_Acc alpaka accelerator [alpaka::acc::*]
+         * @tparam T_Type type of the value
+         * @param acc alpaka accelerator
+         * @param ptr destination pointer
+         * @param compare value compared against the value stored at ptr
+         * @param value source value
+         * @return result of ::alpaka::atomic::atomicOp (value at ptr before the operation)
+         */
+        template<
+            typename T_Hierarchy,
+            typename T_Acc,
+            typename T_Type
+        >
+        ALPAKA_FN_ACC ALPAKA_FN_INLINE
+        T_Type atomicCas(
+            T_Acc const & acc,
+            T_Type *ptr,
+            T_Type const & compare,
+            T_Type const & value
+        )
+        {
+            return ::alpaka::atomic::atomicOp< ::alpaka::atomic::op::Cas >(
+                acc,
+                ptr,
+                compare,
+                value,
+                T_Hierarchy{}
+            );
+        }
+
+        /*! @param hierarchy tag selecting the atomicity level; only its type is used */
+        template<
+            typename T_Acc,
+            typename T_Type,
+            typename T_Hierarchy = hierarchy::Grids
+        >
+        ALPAKA_FN_ACC ALPAKA_FN_INLINE
+        T_Type atomicCas(
+            T_Acc const & acc,
+            T_Type *ptr,
+            T_Type const & compare,
+            T_Type const & value,
+            T_Hierarchy const & hierarchy = T_Hierarchy()
+        )
+        {
+            return atomicCas< T_Hierarchy >(
+                acc,
+                ptr,
+                compare,
+                value
+            );
+        }
+        /*!@} */
+
+} // namespace device
+} // namespace cupla
diff --git a/include/cupla/device/Hierarchy.hpp b/include/cupla/device/Hierarchy.hpp
new file mode 100644
index 00000000000..130faa31be9
--- /dev/null
+++ b/include/cupla/device/Hierarchy.hpp
@@ -0,0 +1,43 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/types.hpp"
+
+#include <alpaka/core/Positioning.hpp>
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+namespace hierarchy
+{
+
+    //! make alpaka's hierarchy tags (e.g. Grids) usable as cupla::hierarchy::* in atomic operations
+    using namespace ::alpaka::hierarchy;
+
+} // namespace hierarchy
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla/device/Index.hpp b/include/cupla/device/Index.hpp
new file mode 100644
index 00000000000..2f90572f4f3
--- /dev/null
+++ b/include/cupla/device/Index.hpp
@@ -0,0 +1,120 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/datatypes/uint.hpp"
+#include "cupla/types.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+namespace cupla
+{
+inline namespace device
+{
+
+    /** extent of the grid, measured in blocks
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::acc::*]
+     * @param acc alpaka accelerator
+     * @return alpaka work division (Grid, Blocks) converted to cupla::uint3
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 gridDim( T_Acc const & acc )
+    {
+        auto const blocksPerGrid = ::alpaka::workdiv::getWorkDiv<
+            ::alpaka::Grid,
+            ::alpaka::Blocks
+        >( acc );
+        return static_cast< uint3 >( blocksPerGrid );
+    }
+
+    /** extent of a block, measured in threads
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::acc::*]
+     * @param acc alpaka accelerator
+     * @return alpaka work division (Block, Threads) converted to cupla::uint3
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 blockDim( T_Acc const & acc )
+    {
+        auto const threadsPerBlock = ::alpaka::workdiv::getWorkDiv<
+            ::alpaka::Block,
+            ::alpaka::Threads
+        >( acc );
+        return static_cast< uint3 >( threadsPerBlock );
+    }
+
+    /** extent of a thread, measured in elements
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::acc::*]
+     * @param acc alpaka accelerator
+     * @return alpaka work division (Thread, Elems) converted to cupla::uint3
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 threadDim( T_Acc const & acc )
+    {
+        auto const elemsPerThread = ::alpaka::workdiv::getWorkDiv<
+            ::alpaka::Thread,
+            ::alpaka::Elems
+        >( acc );
+        return static_cast< uint3 >( elemsPerThread );
+    }
+
+    /** position of the calling thread inside its block
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::acc::*]
+     * @param acc alpaka accelerator
+     * @return alpaka index (Block, Threads) converted to cupla::uint3
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 threadIdx( T_Acc const & acc )
+    {
+        auto const threadInBlock = ::alpaka::idx::getIdx<
+            ::alpaka::Block,
+            ::alpaka::Threads
+        >( acc );
+        return static_cast< uint3 >( threadInBlock );
+    }
+
+    /** position of the calling thread's block inside the grid
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::acc::*]
+     * @param acc alpaka accelerator
+     * @return alpaka index (Grid, Blocks) converted to cupla::uint3
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 blockIdx( T_Acc const & acc )
+    {
+        auto const blockInGrid = ::alpaka::idx::getIdx<
+            ::alpaka::Grid,
+            ::alpaka::Blocks
+        >( acc );
+        return static_cast< uint3 >( blockInGrid );
+    }
+
+} // namespace device
+} // namespace cupla
diff --git a/include/cupla/device/SharedMemory.hpp b/include/cupla/device/SharedMemory.hpp
new file mode 100644
index 00000000000..a04169eb22f
--- /dev/null
+++ b/include/cupla/device/SharedMemory.hpp
@@ -0,0 +1,32 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+//! declare reference `ppName` (type `__VA_ARGS__ &`) bound to alpaka's allocVar result; needs `acc` in scope
+#define sharedMem(ppName, ...)                                                 \
+     __VA_ARGS__& ppName =                                                     \
+        ::alpaka::block::shared::st::allocVar< __VA_ARGS__, __COUNTER__ >( acc )
+//! declare pointer `ppName` (type `__VA_ARGS__ *`) set from alpaka's dyn::getMem; needs `acc` in scope
+#define sharedMemExtern(ppName, ...)                                           \
+    __VA_ARGS__* ppName =                                                      \
+        ::alpaka::block::shared::dyn::getMem< __VA_ARGS__ >( acc )
diff --git a/include/cupla/device/Synchronization.hpp b/include/cupla/device/Synchronization.hpp
new file mode 100644
index 00000000000..e24e37b4624
--- /dev/null
+++ b/include/cupla/device/Synchronization.hpp
@@ -0,0 +1,57 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/types.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+namespace cupla
+{
+inline namespace device
+{
+
+    /** synchronize the threads of a block via alpaka's syncBlockThreads()
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::acc::*]
+     * @param acc alpaka accelerator
+     *
+     * @{
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void syncThreads( T_Acc const & acc )
+    {
+        namespace blockSync = ::alpaka::block::sync;
+        blockSync::syncBlockThreads( acc );
+    }
+
+    //! same operation under the CUDA spelling, delegates to syncThreads()
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void __syncthreads( T_Acc const & acc )
+    {
+        syncThreads( acc );
+    }
+
+    //!@}
+
+} // namespace device
+} // namespace cupla
diff --git a/include/cupla/device_functions.hpp b/include/cupla/device_functions.hpp
new file mode 100644
index 00000000000..56fc6af01d2
--- /dev/null
+++ b/include/cupla/device_functions.hpp
@@ -0,0 +1,27 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/Synchronization.hpp"
+#include "cupla/device/Index.hpp"
+#include "cupla/device/Atomic.hpp"
+#include "cupla/device/SharedMemory.hpp"
diff --git a/include/cupla/kernel.hpp b/include/cupla/kernel.hpp
index d0d27506f7c..df287f170f4 100644
--- a/include/cupla/kernel.hpp
+++ b/include/cupla/kernel.hpp
@@ -172,6 +172,10 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     /* Kernel configuration interface with element support
      *
      * The kernel must support the alpaka element layer.
+     *
+     * Swap the blockSize and the elemSize depending on the activated accelerator.
+     * This means that on some devices the blockSize is set to one ( dim3(1,1,1) )
+     * and the elemSize is set to the user defined blockSize
      */
     template<
         typename T_KernelType
@@ -201,11 +205,8 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     };
 
     /** Kernel configuration interface with element support
-     * The kernel must support the alpaka element level
      *
-     * Swap the blockSize and the elemSize depending on the activated accelerator.
-     * This mean that in some devices the blockSize is set to one ( dim3(1,1,1) )
-     * and the elemSize is set to the user defined blockSize
+     * The kernel must support the alpaka element level
      */
     template<
         typename T_KernelType